Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig80
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/Makefile.um2
-rw-r--r--arch/x86/boot/compressed/aslr.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S3
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c5
-rw-r--r--arch/x86/boot/compressed/misc.h6
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/video-mode.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/boot/video.h1
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c187
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c15
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c15
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c9
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c15
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c7
-rw-r--r--arch/x86/crypto/glue_helper.c1
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c15
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c15
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c15
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c9
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c2
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c139
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S10
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S10
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S10
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c193
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S6
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S6
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S6
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c202
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c15
-rw-r--r--arch/x86/ia32/Makefile1
-rw-r--r--arch/x86/ia32/ia32_signal.c19
-rw-r--r--arch/x86/ia32/ia32entry.S485
-rw-r--r--arch/x86/ia32/nosyscall.c7
-rw-r--r--arch/x86/ia32/sys_ia32.c14
-rw-r--r--arch/x86/ia32/syscall_ia32.c25
-rw-r--r--arch/x86/include/asm/alternative-asm.h53
-rw-r--r--arch/x86/include/asm/alternative.h73
-rw-r--r--arch/x86/include/asm/apic.h3
-rw-r--r--arch/x86/include/asm/barrier.h6
-rw-r--r--arch/x86/include/asm/calling.h284
-rw-r--r--arch/x86/include/asm/compat.h2
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h42
-rw-r--r--arch/x86/include/asm/desc.h7
-rw-r--r--arch/x86/include/asm/dwarf2.h24
-rw-r--r--arch/x86/include/asm/e820.h8
-rw-r--r--arch/x86/include/asm/efi.h6
-rw-r--r--arch/x86/include/asm/elf.h11
-rw-r--r--arch/x86/include/asm/fpu-internal.h130
-rw-r--r--arch/x86/include/asm/hw_irq.h5
-rw-r--r--arch/x86/include/asm/insn.h2
-rw-r--r--arch/x86/include/asm/iommu_table.h11
-rw-r--r--arch/x86/include/asm/irqflags.h49
-rw-r--r--arch/x86/include/asm/jump_label.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h28
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/livepatch.h4
-rw-r--r--arch/x86/include/asm/mce.h16
-rw-r--r--arch/x86/include/asm/microcode.h73
-rw-r--r--arch/x86/include/asm/microcode_intel.h13
-rw-r--r--arch/x86/include/asm/mwait.h8
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h13
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pgalloc.h8
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h1
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/pm-trace.h (renamed from arch/x86/include/asm/resume-trace.h)10
-rw-r--r--arch/x86/include/asm/processor.h110
-rw-r--r--arch/x86/include/asm/ptrace.h45
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/seccomp.h21
-rw-r--r--arch/x86/include/asm/seccomp_32.h11
-rw-r--r--arch/x86/include/asm/seccomp_64.h17
-rw-r--r--arch/x86/include/asm/segment.h289
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/sigcontext.h6
-rw-r--r--arch/x86/include/asm/sighandling.h4
-rw-r--r--arch/x86/include/asm/smap.h30
-rw-r--r--arch/x86/include/asm/smp.h3
-rw-r--r--arch/x86/include/asm/special_insns.h24
-rw-r--r--arch/x86/include/asm/thread_info.h77
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/include/uapi/asm/e820.h10
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h18
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h16
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h13
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h21
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c163
-rw-r--r--arch/x86/kernel/apic/apic.c62
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c89
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c126
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c715
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c75
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c345
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c22
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c223
-rw-r--r--arch/x86/kernel/cpu/perf_event.h181
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c908
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c525
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c1379
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c1100
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/devicetree.c4
-rw-r--r--arch/x86/kernel/dumpstack.c15
-rw-r--r--arch/x86/kernel/dumpstack_32.c13
-rw-r--r--arch/x86/kernel/dumpstack_64.c11
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/early_printk.c32
-rw-r--r--arch/x86/kernel/entry_32.S93
-rw-r--r--arch/x86/kernel/entry_64.S978
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S3
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/i387.c58
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/irqinit.c3
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes/core.c13
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/perf_regs.c40
-rw-r--r--arch/x86/kernel/pmem.c53
-rw-r--r--arch/x86/kernel/process.c106
-rw-r--r--arch/x86/kernel/process_32.c27
-rw-r--r--arch/x86/kernel/process_64.c24
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S8
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S16
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/signal.c90
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/sys_x86_64.c30
-rw-r--r--arch/x86/kernel/syscall_32.c16
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c62
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c24
-rw-r--r--arch/x86/kernel/xsave.c39
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c33
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c193
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c12
-rw-r--r--arch/x86/kvm/ioapic.c22
-rw-r--r--arch/x86/kvm/ioapic.h11
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c147
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.c73
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c43
-rw-r--r--arch/x86/kvm/vmx.c146
-rw-r--r--arch/x86/kvm/x86.c171
-rw-r--r--arch/x86/lguest/boot.c4
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S50
-rw-r--r--arch/x86/lib/checksum_32.S64
-rw-r--r--arch/x86/lib/clear_page_64.S66
-rw-r--r--arch/x86/lib/copy_page_64.S37
-rw-r--r--arch/x86/lib/copy_user_64.S46
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/insn.c13
-rw-r--r--arch/x86/lib/memcpy_64.S68
-rw-r--r--arch/x86/lib/memmove_64.S19
-rw-r--r--arch/x86/lib/memset_64.S61
-rw-r--r--arch/x86/lib/msr-reg.S24
-rw-r--r--arch/x86/lib/rwsem.S44
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S28
-rw-r--r--arch/x86/lib/x86-opcode-map.txt9
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c8
-rw-r--r--arch/x86/mm/init.c69
-rw-r--r--arch/x86/mm/init_64.c14
-rw-r--r--arch/x86/mm/ioremap.c23
-rw-r--r--arch/x86/mm/memtest.c118
-rw-r--r--arch/x86/mm/mmap.c38
-rw-r--r--arch/x86/mm/numa.c11
-rw-r--r--arch/x86/mm/pageattr.c4
-rw-r--r--arch/x86/mm/pat.c6
-rw-r--r--arch/x86/mm/pgtable.c160
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c4
-rw-r--r--arch/x86/platform/efi/efi.c17
-rw-r--r--arch/x86/platform/efi/efi_32.c22
-rw-r--r--arch/x86/platform/efi/efi_64.c29
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c10
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c4
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c4
-rw-r--r--arch/x86/platform/uv/tlb_uv.c6
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/syscalls/Makefile9
-rw-r--r--arch/x86/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/um/Makefile1
-rw-r--r--arch/x86/um/asm/barrier.h15
-rw-r--r--arch/x86/um/asm/elf.h2
-rw-r--r--arch/x86/um/ldt.c227
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_32.h3
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_64.h3
-rw-r--r--arch/x86/um/shared/sysdep/skas_ptrace.h22
-rw-r--r--arch/x86/um/signal.c7
-rw-r--r--arch/x86/um/sys_call_table_64.c2
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/vdso/vclock_gettime.c34
-rw-r--r--arch/x86/vdso/vdso32/syscall.S2
-rw-r--r--arch/x86/xen/apic.c180
-rw-r--r--arch/x86/xen/enlighten.c91
-rw-r--r--arch/x86/xen/mmu.c221
-rw-r--r--arch/x86/xen/smp.c60
-rw-r--r--arch/x86/xen/suspend.c11
-rw-r--r--arch/x86/xen/trace.c50
-rw-r--r--arch/x86/xen/xen-asm_64.S8
-rw-r--r--arch/x86/xen/xen-head.S63
260 files changed, 9747 insertions, 5373 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..6049d587599e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,7 @@ config X86
87 select HAVE_ARCH_KMEMCHECK 87 select HAVE_ARCH_KMEMCHECK
88 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP 88 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
89 select HAVE_USER_RETURN_NOTIFIER 89 select HAVE_USER_RETURN_NOTIFIER
90 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 90 select ARCH_HAS_ELF_RANDOMIZE
91 select HAVE_ARCH_JUMP_LABEL 91 select HAVE_ARCH_JUMP_LABEL
92 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 92 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
93 select SPARSE_IRQ 93 select SPARSE_IRQ
@@ -99,6 +99,7 @@ config X86
99 select IRQ_FORCED_THREADING 99 select IRQ_FORCED_THREADING
100 select HAVE_BPF_JIT if X86_64 100 select HAVE_BPF_JIT if X86_64
101 select HAVE_ARCH_TRANSPARENT_HUGEPAGE 101 select HAVE_ARCH_TRANSPARENT_HUGEPAGE
102 select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
102 select ARCH_HAS_SG_CHAIN 103 select ARCH_HAS_SG_CHAIN
103 select CLKEVT_I8253 104 select CLKEVT_I8253
104 select ARCH_HAVE_NMI_SAFE_CMPXCHG 105 select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -177,7 +178,7 @@ config SBUS
177 178
178config NEED_DMA_MAP_STATE 179config NEED_DMA_MAP_STATE
179 def_bool y 180 def_bool y
180 depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG 181 depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
181 182
182config NEED_SG_DMA_LENGTH 183config NEED_SG_DMA_LENGTH
183 def_bool y 184 def_bool y
@@ -235,12 +236,10 @@ config ARCH_WANT_GENERAL_HUGETLB
235 def_bool y 236 def_bool y
236 237
237config ZONE_DMA32 238config ZONE_DMA32
238 bool 239 def_bool y if X86_64
239 default X86_64
240 240
241config AUDIT_ARCH 241config AUDIT_ARCH
242 bool 242 def_bool y if X86_64
243 default X86_64
244 243
245config ARCH_SUPPORTS_OPTIMIZED_INLINING 244config ARCH_SUPPORTS_OPTIMIZED_INLINING
246 def_bool y 245 def_bool y
@@ -279,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES
279config FIX_EARLYCON_MEM 278config FIX_EARLYCON_MEM
280 def_bool y 279 def_bool y
281 280
281config PGTABLE_LEVELS
282 int
283 default 4 if X86_64
284 default 3 if X86_PAE
285 default 2
286
282source "init/Kconfig" 287source "init/Kconfig"
283source "kernel/Kconfig.freezer" 288source "kernel/Kconfig.freezer"
284 289
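
The PGTABLE_LEVELS symbol introduced above gives arch and generic code a single integer (2, 3 or 4) to compare against instead of testing X86_PAE/X86_64 combinations. A minimal sketch of how such a Kconfig integer is typically consumed from C via the generated config macros (illustrative only, not part of this patch; the helper name is made up):

	#include <linux/kconfig.h>	/* pulls in the generated CONFIG_* macros */
	#include <linux/types.h>

	/*
	 * CONFIG_PGTABLE_LEVELS is a plain integer, so it can be used in
	 * preprocessor arithmetic to compile paging helpers in or out.
	 */
	#if CONFIG_PGTABLE_LEVELS > 3
	static inline bool pgtable_has_real_pud(void) { return true; }
	#else
	static inline bool pgtable_has_real_pud(void) { return false; }
	#endif
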
@@ -716,17 +721,6 @@ endif #HYPERVISOR_GUEST
716config NO_BOOTMEM 721config NO_BOOTMEM
717 def_bool y 722 def_bool y
718 723
719config MEMTEST
720 bool "Memtest"
721 ---help---
722 This option adds a kernel parameter 'memtest', which allows memtest
723 to be set.
724 memtest=0, mean disabled; -- default
725 memtest=1, mean do 1 test pattern;
726 ...
727 memtest=4, mean do 4 test patterns.
728 If you are unsure how to answer this question, answer N.
729
730source "arch/x86/Kconfig.cpu" 724source "arch/x86/Kconfig.cpu"
731 725
732config HPET_TIMER 726config HPET_TIMER
@@ -891,7 +885,8 @@ config UP_LATE_INIT
891 depends on !SMP && X86_LOCAL_APIC 885 depends on !SMP && X86_LOCAL_APIC
892 886
893config X86_UP_APIC 887config X86_UP_APIC
894 bool "Local APIC support on uniprocessors" 888 bool "Local APIC support on uniprocessors" if !PCI_MSI
889 default PCI_MSI
895 depends on X86_32 && !SMP && !X86_32_NON_STANDARD 890 depends on X86_32 && !SMP && !X86_32_NON_STANDARD
896 ---help--- 891 ---help---
897 A local APIC (Advanced Programmable Interrupt Controller) is an 892 A local APIC (Advanced Programmable Interrupt Controller) is an
@@ -903,10 +898,6 @@ config X86_UP_APIC
903 performance counters), and the NMI watchdog which detects hard 898 performance counters), and the NMI watchdog which detects hard
904 lockups. 899 lockups.
905 900
906config X86_UP_APIC_MSI
907 def_bool y
908 select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
909
910config X86_UP_IOAPIC 901config X86_UP_IOAPIC
911 bool "IO-APIC support on uniprocessors" 902 bool "IO-APIC support on uniprocessors"
912 depends on X86_UP_APIC 903 depends on X86_UP_APIC
@@ -925,8 +916,8 @@ config X86_LOCAL_APIC
925 select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ 916 select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
926 917
927config X86_IO_APIC 918config X86_IO_APIC
928 def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC 919 def_bool y
929 depends on X86_LOCAL_APIC 920 depends on X86_LOCAL_APIC || X86_UP_IOAPIC
930 select IRQ_DOMAIN 921 select IRQ_DOMAIN
931 922
932config X86_REROUTE_FOR_BROKEN_BOOT_IRQS 923config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -1145,10 +1136,10 @@ config MICROCODE_OLD_INTERFACE
1145 depends on MICROCODE 1136 depends on MICROCODE
1146 1137
1147config MICROCODE_INTEL_EARLY 1138config MICROCODE_INTEL_EARLY
1148 def_bool n 1139 bool
1149 1140
1150config MICROCODE_AMD_EARLY 1141config MICROCODE_AMD_EARLY
1151 def_bool n 1142 bool
1152 1143
1153config MICROCODE_EARLY 1144config MICROCODE_EARLY
1154 bool "Early load microcode" 1145 bool "Early load microcode"
@@ -1300,14 +1291,14 @@ config ARCH_DMA_ADDR_T_64BIT
1300 def_bool y 1291 def_bool y
1301 depends on X86_64 || HIGHMEM64G 1292 depends on X86_64 || HIGHMEM64G
1302 1293
1303config DIRECT_GBPAGES 1294config X86_DIRECT_GBPAGES
1304 bool "Enable 1GB pages for kernel pagetables" if EXPERT 1295 def_bool y
1305 default y 1296 depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
1306 depends on X86_64
1307 ---help--- 1297 ---help---
1308 Allow the kernel linear mapping to use 1GB pages on CPUs that 1298 Certain kernel features effectively disable kernel
1309 support it. This can improve the kernel's performance a tiny bit by 1299 linear 1 GB mappings (even if the CPU otherwise
1310 reducing TLB pressure. If in doubt, say "Y". 1300 supports them), so don't confuse the user by printing
1301 that we have them enabled.
1311 1302
1312# Common NUMA Features 1303# Common NUMA Features
1313config NUMA 1304config NUMA
@@ -1430,6 +1421,16 @@ config ILLEGAL_POINTER_VALUE
1430 1421
1431source "mm/Kconfig" 1422source "mm/Kconfig"
1432 1423
1424config X86_PMEM_LEGACY
1425 bool "Support non-standard NVDIMMs and ADR protected memory"
1426 help
1427 Treat memory marked using the non-standard e820 type of 12 as used
1428 by the Intel Sandy Bridge-EP reference BIOS as protected memory.
1429 The kernel will offer these regions to the 'pmem' driver so
1430 they can be used for persistent storage.
1431
1432 Say Y if unsure.
1433
1433config HIGHPTE 1434config HIGHPTE
1434 bool "Allocate 3rd-level pagetables from highmem" 1435 bool "Allocate 3rd-level pagetables from highmem"
1435 depends on HIGHMEM 1436 depends on HIGHMEM
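
The new X86_PMEM_LEGACY option above pairs with the arch/x86/kernel/pmem.c file listed in the diffstat. Roughly, that code walks the e820 map and hands any region of the non-standard type 12 to the 'pmem' driver. A simplified, hypothetical sketch of the idea (not the actual file contents):

	#include <linux/init.h>
	#include <asm/e820.h>

	static int __init find_legacy_pmem(void)
	{
		int i;

		for (i = 0; i < e820.nr_map; i++) {
			struct e820entry *e = &e820.map[i];

			if (e->type != 12)	/* non-standard "protected memory" e820 type */
				continue;
			/* register a platform device covering [e->addr, e->addr + e->size) */
		}
		return 0;
	}
	device_initcall(find_legacy_pmem);
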
@@ -1747,14 +1748,11 @@ config KEXEC_VERIFY_SIG
1747 depends on KEXEC_FILE 1748 depends on KEXEC_FILE
1748 ---help--- 1749 ---help---
1749 This option makes kernel signature verification mandatory for 1750 This option makes kernel signature verification mandatory for
1750 kexec_file_load() syscall. If kernel is signature can not be 1751 the kexec_file_load() syscall.
1751 verified, kexec_file_load() will fail. 1752
1752 1753 In addition to that option, you need to enable signature
1753 This option enforces signature verification at generic level. 1754 verification for the corresponding kernel image type being
1754 One needs to enable signature verification for type of kernel 1755 loaded in order for this to work.
1755 image being loaded to make sure it works. For example, enable
1756 bzImage signature verification option to be able to load and
1757 verify signatures of bzImage. Otherwise kernel loading will fail.
1758 1756
1759config KEXEC_BZIMAGE_VERIFY_SIG 1757config KEXEC_BZIMAGE_VERIFY_SIG
1760 bool "Enable bzImage signature verification support" 1758 bool "Enable bzImage signature verification support"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ba2d9ce82dc..2fda005bb334 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y)
63 $(call cc-option,-fno-unit-at-a-time)) 63 $(call cc-option,-fno-unit-at-a-time))
64 64
65 # CPU-specific tuning. Anything which can be shared with UML should go here. 65 # CPU-specific tuning. Anything which can be shared with UML should go here.
66 include $(srctree)/arch/x86/Makefile_32.cpu 66 include arch/x86/Makefile_32.cpu
67 KBUILD_CFLAGS += $(cflags-y) 67 KBUILD_CFLAGS += $(cflags-y)
68 68
69 # temporary until string.h is fixed 69 # temporary until string.h is fixed
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA := -Ui386
18export LDS_EXTRA 18export LDS_EXTRA
19 19
20# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. 20# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
21include $(srctree)/arch/x86/Makefile_32.cpu 21include arch/x86/Makefile_32.cpu
22 22
23# prevent gcc from keeping the stack 16 byte aligned. Taken from i386. 23# prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
24cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) 24cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
295 return slots_fetch_random(); 295 return slots_fetch_random();
296} 296}
297 297
298unsigned char *choose_kernel_location(unsigned char *input, 298unsigned char *choose_kernel_location(struct boot_params *boot_params,
299 unsigned char *input,
299 unsigned long input_size, 300 unsigned long input_size,
300 unsigned char *output, 301 unsigned char *output,
301 unsigned long output_size) 302 unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
315 } 316 }
316#endif 317#endif
317 318
319 boot_params->hdr.loadflags |= KASLR_FLAG;
320
318 /* Record the various known unsafe memory ranges. */ 321 /* Record the various known unsafe memory ranges. */
319 mem_avoid_init((unsigned long)input, input_size, 322 mem_avoid_init((unsigned long)input, input_size,
320 (unsigned long)output, output_size); 323 (unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
29#include <asm/page_types.h> 29#include <asm/page_types.h>
30#include <asm/boot.h> 30#include <asm/boot.h>
31#include <asm/asm-offsets.h> 31#include <asm/asm-offsets.h>
32#include <asm/bootparam.h>
32 33
33 __HEAD 34 __HEAD
34ENTRY(startup_32) 35ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
102 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 103 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
103 * us to not reload segments 104 * us to not reload segments
104 */ 105 */
105 testb $(1<<6), BP_loadflags(%esi) 106 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
106 jnz 1f 107 jnz 1f
107 108
108 cli 109 cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
31#include <asm/msr.h> 31#include <asm/msr.h>
32#include <asm/processor-flags.h> 32#include <asm/processor-flags.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/bootparam.h>
34 35
35 __HEAD 36 __HEAD
36 .code32 37 .code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
46 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 47 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
47 * us to not reload segments 48 * us to not reload segments
48 */ 49 */
49 testb $(1<<6), BP_loadflags(%esi) 50 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
50 jnz 1f 51 jnz 1f
51 52
52 cli 53 cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
164 /* After gdt is loaded */ 165 /* After gdt is loaded */
165 xorl %eax, %eax 166 xorl %eax, %eax
166 lldt %ax 167 lldt %ax
167 movl $0x20, %eax 168 movl $__BOOT_TSS, %eax
168 ltr %ax 169 ltr %ax
169 170
170 /* 171 /*
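
Both head_32.S and head_64.S above now test the symbolic KEEP_SEGMENTS name from <asm/bootparam.h> rather than the raw bit, and the aslr.c change earlier plus the misc.c change that follows use the companion KASLR_FLAG bit in the same loadflags field. For orientation, the relevant bits look roughly like this (a sketch of the uapi definitions, reconstructed here rather than quoted from the patch):

	/* boot_params.hdr.loadflags bits (see arch/x86/include/uapi/asm/bootparam.h) */
	#define KASLR_FLAG	(1 << 1)	/* set when the decompressor randomized the kernel */
	#define KEEP_SEGMENTS	(1 << 6)	/* bootloader asks us not to reload segment registers */
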
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
377 377
378 real_mode = rmode; 378 real_mode = rmode;
379 379
380 /* Clear it for solely in-kernel use */
381 real_mode->hdr.loadflags &= ~KASLR_FLAG;
382
380 sanitize_boot_params(real_mode); 383 sanitize_boot_params(real_mode);
381 384
382 if (real_mode->screen_info.orig_video_mode == 7) { 385 if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
401 * the entire decompressed kernel plus relocation table, or the 404 * the entire decompressed kernel plus relocation table, or the
402 * entire decompressed kernel plus .bss and .brk sections. 405 * entire decompressed kernel plus .bss and .brk sections.
403 */ 406 */
404 output = choose_kernel_location(input_data, input_len, output, 407 output = choose_kernel_location(real_mode, input_data, input_len, output,
405 output_len > run_size ? output_len 408 output_len > run_size ? output_len
406 : run_size); 409 : run_size);
407 410
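
Taken together, the aslr.c and misc.c hunks make KASLR_FLAG a purely in-kernel signal: decompress_kernel() first clears whatever the bootloader left in loadflags, and choose_kernel_location() sets the bit only when it actually randomizes the load address. Later boot code can then report whether KASLR took effect; a hedged, illustrative consumer (not part of this diff):

	#include <linux/types.h>
	#include <asm/bootparam.h>

	/* Illustrative helper: did the decompressor actually randomize the kernel? */
	static bool kernel_location_was_randomized(const struct boot_params *bp)
	{
		return bp->hdr.loadflags & KASLR_FLAG;
	}
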
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
57 57
58#if CONFIG_RANDOMIZE_BASE 58#if CONFIG_RANDOMIZE_BASE
59/* aslr.c */ 59/* aslr.c */
60unsigned char *choose_kernel_location(unsigned char *input, 60unsigned char *choose_kernel_location(struct boot_params *boot_params,
61 unsigned char *input,
61 unsigned long input_size, 62 unsigned long input_size,
62 unsigned char *output, 63 unsigned char *output,
63 unsigned long output_size); 64 unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
65bool has_cpuflag(int flag); 66bool has_cpuflag(int flag);
66#else 67#else
67static inline 68static inline
68unsigned char *choose_kernel_location(unsigned char *input, 69unsigned char *choose_kernel_location(struct boot_params *boot_params,
70 unsigned char *input,
69 unsigned long input_size, 71 unsigned long input_size,
70 unsigned char *output, 72 unsigned char *output,
71 unsigned long output_size) 73 unsigned long output_size)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 493f3fd9f139..318b8465d302 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2)
30 int delta = 0; 30 int delta = 0;
31 31
32 while (*s1 || *s2) { 32 while (*s1 || *s2) {
33 delta = *s2 - *s1; 33 delta = *s1 - *s2;
34 if (delta) 34 if (delta)
35 return delta; 35 return delta;
36 s1++; 36 s1++;
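
The one-character string.c change flips the subtraction so the boot strcmp() follows the usual C sign convention (negative when s1 sorts before s2); the old *s2 - *s1 returned the inverse, which breaks any caller that tests the sign rather than only comparing against zero. A small stand-alone illustration of the fixed behaviour (host-side C, for ASCII input; not kernel code):

	#include <assert.h>

	/* Mirrors the fixed boot-code loop. */
	static int boot_strcmp(const char *s1, const char *s2)
	{
		while (*s1 || *s2) {
			int delta = *s1 - *s2;	/* fixed operand order */
			if (delta)
				return delta;
			s1++;
			s2++;
		}
		return 0;
	}

	int main(void)
	{
		assert(boot_strcmp("abc", "abd") < 0);	/* "abc" sorts first */
		assert(boot_strcmp("abc", "abc") == 0);
		return 0;
	}
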
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..aa8a96b052e3 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -22,10 +22,8 @@
22/* 22/*
23 * Common variables 23 * Common variables
24 */ 24 */
25int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ 25int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
26u16 video_segment;
27int force_x, force_y; /* Don't query the BIOS for cols/rows */ 26int force_x, force_y; /* Don't query the BIOS for cols/rows */
28
29int do_restore; /* Screen contents changed during mode flip */ 27int do_restore; /* Screen contents changed during mode flip */
30int graphic_mode; /* Graphic mode with linear frame buffer */ 28int graphic_mode; /* Graphic mode with linear frame buffer */
31 29
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 43eda284d27f..05111bb8d018 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -17,6 +17,8 @@
17#include "video.h" 17#include "video.h"
18#include "vesa.h" 18#include "vesa.h"
19 19
20static u16 video_segment;
21
20static void store_cursor_position(void) 22static void store_cursor_position(void)
21{ 23{
22 struct biosregs ireg, oreg; 24 struct biosregs ireg, oreg;
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 0bb25491262d..b54e0328c449 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -91,7 +91,6 @@ int mode_defined(u16 mode); /* video.c */
91#define ADAPTER_VGA 2 91#define ADAPTER_VGA 2
92 92
93extern int adapter; 93extern int adapter;
94extern u16 video_segment;
95extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ 94extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
96extern int do_restore; /* Restore screen contents */ 95extern int do_restore; /* Restore screen contents */
97extern int graphic_mode; /* Graphics mode with linear frame buffer */ 96extern int graphic_mode; /* Graphics mode with linear frame buffer */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 419819d6dab3..aaa1118bf01e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -248,7 +248,7 @@ CONFIG_USB=y
248CONFIG_USB_ANNOUNCE_NEW_DEVICES=y 248CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
249CONFIG_USB_MON=y 249CONFIG_USB_MON=y
250CONFIG_USB_EHCI_HCD=y 250CONFIG_USB_EHCI_HCD=y
251# CONFIG_USB_EHCI_TT_NEWSCHED is not set 251CONFIG_USB_EHCI_TT_NEWSCHED=y
252CONFIG_USB_OHCI_HCD=y 252CONFIG_USB_OHCI_HCD=y
253CONFIG_USB_UHCI_HCD=y 253CONFIG_USB_UHCI_HCD=y
254CONFIG_USB_PRINTER=y 254CONFIG_USB_PRINTER=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4c311ddd973b..315b86106572 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,7 +243,7 @@ CONFIG_USB=y
243CONFIG_USB_ANNOUNCE_NEW_DEVICES=y 243CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
244CONFIG_USB_MON=y 244CONFIG_USB_MON=y
245CONFIG_USB_EHCI_HCD=y 245CONFIG_USB_EHCI_HCD=y
246# CONFIG_USB_EHCI_TT_NEWSCHED is not set 246CONFIG_USB_EHCI_TT_NEWSCHED=y
247CONFIG_USB_OHCI_HCD=y 247CONFIG_USB_OHCI_HCD=y
248CONFIG_USB_UHCI_HCD=y 248CONFIG_USB_UHCI_HCD=y
249CONFIG_USB_PRINTER=y 249CONFIG_USB_PRINTER=y
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 54f60ab41c63..112cefacf2af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm)
797 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); 797 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
798 struct crypto_aead *cryptd_child; 798 struct crypto_aead *cryptd_child;
799 struct aesni_rfc4106_gcm_ctx *child_ctx; 799 struct aesni_rfc4106_gcm_ctx *child_ctx;
800 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); 800 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
801 CRYPTO_ALG_INTERNAL,
802 CRYPTO_ALG_INTERNAL);
801 if (IS_ERR(cryptd_tfm)) 803 if (IS_ERR(cryptd_tfm))
802 return PTR_ERR(cryptd_tfm); 804 return PTR_ERR(cryptd_tfm);
803 805
@@ -890,15 +892,12 @@ out_free_ablkcipher:
890 return ret; 892 return ret;
891} 893}
892 894
893static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, 895static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
894 unsigned int key_len) 896 unsigned int key_len)
895{ 897{
896 int ret = 0; 898 int ret = 0;
897 struct crypto_tfm *tfm = crypto_aead_tfm(parent); 899 struct crypto_tfm *tfm = crypto_aead_tfm(aead);
898 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 900 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
899 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
900 struct aesni_rfc4106_gcm_ctx *child_ctx =
901 aesni_rfc4106_gcm_ctx_get(cryptd_child);
902 u8 *new_key_align, *new_key_mem = NULL; 901 u8 *new_key_align, *new_key_mem = NULL;
903 902
904 if (key_len < 4) { 903 if (key_len < 4) {
@@ -943,20 +942,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
943 goto exit; 942 goto exit;
944 } 943 }
945 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); 944 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
946 memcpy(child_ctx, ctx, sizeof(*ctx));
947exit: 945exit:
948 kfree(new_key_mem); 946 kfree(new_key_mem);
949 return ret; 947 return ret;
950} 948}
951 949
952/* This is the Integrity Check Value (aka the authentication tag length and can 950static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
953 * be 8, 12 or 16 bytes long. */ 951 unsigned int key_len)
954static int rfc4106_set_authsize(struct crypto_aead *parent,
955 unsigned int authsize)
956{ 952{
957 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 953 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
958 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 954 struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
955 struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
956 struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
957 int ret;
959 958
959 ret = crypto_aead_setkey(child, key, key_len);
960 if (!ret) {
961 memcpy(ctx, c_ctx, sizeof(*ctx));
962 ctx->cryptd_tfm = cryptd_tfm;
963 }
964 return ret;
965}
966
967static int common_rfc4106_set_authsize(struct crypto_aead *aead,
968 unsigned int authsize)
969{
960 switch (authsize) { 970 switch (authsize) {
961 case 8: 971 case 8:
962 case 12: 972 case 12:
@@ -965,51 +975,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
965 default: 975 default:
966 return -EINVAL; 976 return -EINVAL;
967 } 977 }
968 crypto_aead_crt(parent)->authsize = authsize; 978 crypto_aead_crt(aead)->authsize = authsize;
969 crypto_aead_crt(cryptd_child)->authsize = authsize;
970 return 0; 979 return 0;
971} 980}
972 981
973static int rfc4106_encrypt(struct aead_request *req) 982/* This is the Integrity Check Value (aka the authentication tag length and can
974{ 983 * be 8, 12 or 16 bytes long. */
975 int ret; 984static int rfc4106_set_authsize(struct crypto_aead *parent,
976 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 985 unsigned int authsize)
977 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
978
979 if (!irq_fpu_usable()) {
980 struct aead_request *cryptd_req =
981 (struct aead_request *) aead_request_ctx(req);
982 memcpy(cryptd_req, req, sizeof(*req));
983 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
984 return crypto_aead_encrypt(cryptd_req);
985 } else {
986 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
987 kernel_fpu_begin();
988 ret = cryptd_child->base.crt_aead.encrypt(req);
989 kernel_fpu_end();
990 return ret;
991 }
992}
993
994static int rfc4106_decrypt(struct aead_request *req)
995{ 986{
987 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
988 struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
996 int ret; 989 int ret;
997 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
998 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
999 990
1000 if (!irq_fpu_usable()) { 991 ret = crypto_aead_setauthsize(child, authsize);
1001 struct aead_request *cryptd_req = 992 if (!ret)
1002 (struct aead_request *) aead_request_ctx(req); 993 crypto_aead_crt(parent)->authsize = authsize;
1003 memcpy(cryptd_req, req, sizeof(*req)); 994 return ret;
1004 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1005 return crypto_aead_decrypt(cryptd_req);
1006 } else {
1007 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1008 kernel_fpu_begin();
1009 ret = cryptd_child->base.crt_aead.decrypt(req);
1010 kernel_fpu_end();
1011 return ret;
1012 }
1013} 995}
1014 996
1015static int __driver_rfc4106_encrypt(struct aead_request *req) 997static int __driver_rfc4106_encrypt(struct aead_request *req)
@@ -1185,6 +1167,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
1185 } 1167 }
1186 return retval; 1168 return retval;
1187} 1169}
1170
1171static int rfc4106_encrypt(struct aead_request *req)
1172{
1173 int ret;
1174 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1175 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1176
1177 if (!irq_fpu_usable()) {
1178 struct aead_request *cryptd_req =
1179 (struct aead_request *) aead_request_ctx(req);
1180
1181 memcpy(cryptd_req, req, sizeof(*req));
1182 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1183 ret = crypto_aead_encrypt(cryptd_req);
1184 } else {
1185 kernel_fpu_begin();
1186 ret = __driver_rfc4106_encrypt(req);
1187 kernel_fpu_end();
1188 }
1189 return ret;
1190}
1191
1192static int rfc4106_decrypt(struct aead_request *req)
1193{
1194 int ret;
1195 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1196 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1197
1198 if (!irq_fpu_usable()) {
1199 struct aead_request *cryptd_req =
1200 (struct aead_request *) aead_request_ctx(req);
1201
1202 memcpy(cryptd_req, req, sizeof(*req));
1203 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1204 ret = crypto_aead_decrypt(cryptd_req);
1205 } else {
1206 kernel_fpu_begin();
1207 ret = __driver_rfc4106_decrypt(req);
1208 kernel_fpu_end();
1209 }
1210 return ret;
1211}
1212
1213static int helper_rfc4106_encrypt(struct aead_request *req)
1214{
1215 int ret;
1216
1217 if (unlikely(!irq_fpu_usable())) {
1218 WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
1219 ret = -EINVAL;
1220 } else {
1221 kernel_fpu_begin();
1222 ret = __driver_rfc4106_encrypt(req);
1223 kernel_fpu_end();
1224 }
1225 return ret;
1226}
1227
1228static int helper_rfc4106_decrypt(struct aead_request *req)
1229{
1230 int ret;
1231
1232 if (unlikely(!irq_fpu_usable())) {
1233 WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
1234 ret = -EINVAL;
1235 } else {
1236 kernel_fpu_begin();
1237 ret = __driver_rfc4106_decrypt(req);
1238 kernel_fpu_end();
1239 }
1240 return ret;
1241}
1188#endif 1242#endif
1189 1243
1190static struct crypto_alg aesni_algs[] = { { 1244static struct crypto_alg aesni_algs[] = { {
@@ -1210,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { {
1210 .cra_name = "__aes-aesni", 1264 .cra_name = "__aes-aesni",
1211 .cra_driver_name = "__driver-aes-aesni", 1265 .cra_driver_name = "__driver-aes-aesni",
1212 .cra_priority = 0, 1266 .cra_priority = 0,
1213 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 1267 .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
1214 .cra_blocksize = AES_BLOCK_SIZE, 1268 .cra_blocksize = AES_BLOCK_SIZE,
1215 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1269 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1216 AESNI_ALIGN - 1, 1270 AESNI_ALIGN - 1,
@@ -1229,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { {
1229 .cra_name = "__ecb-aes-aesni", 1283 .cra_name = "__ecb-aes-aesni",
1230 .cra_driver_name = "__driver-ecb-aes-aesni", 1284 .cra_driver_name = "__driver-ecb-aes-aesni",
1231 .cra_priority = 0, 1285 .cra_priority = 0,
1232 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1286 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1287 CRYPTO_ALG_INTERNAL,
1233 .cra_blocksize = AES_BLOCK_SIZE, 1288 .cra_blocksize = AES_BLOCK_SIZE,
1234 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1289 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1235 AESNI_ALIGN - 1, 1290 AESNI_ALIGN - 1,
@@ -1249,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { {
1249 .cra_name = "__cbc-aes-aesni", 1304 .cra_name = "__cbc-aes-aesni",
1250 .cra_driver_name = "__driver-cbc-aes-aesni", 1305 .cra_driver_name = "__driver-cbc-aes-aesni",
1251 .cra_priority = 0, 1306 .cra_priority = 0,
1252 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1307 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1308 CRYPTO_ALG_INTERNAL,
1253 .cra_blocksize = AES_BLOCK_SIZE, 1309 .cra_blocksize = AES_BLOCK_SIZE,
1254 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1310 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1255 AESNI_ALIGN - 1, 1311 AESNI_ALIGN - 1,
@@ -1313,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { {
1313 .cra_name = "__ctr-aes-aesni", 1369 .cra_name = "__ctr-aes-aesni",
1314 .cra_driver_name = "__driver-ctr-aes-aesni", 1370 .cra_driver_name = "__driver-ctr-aes-aesni",
1315 .cra_priority = 0, 1371 .cra_priority = 0,
1316 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1372 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1373 CRYPTO_ALG_INTERNAL,
1317 .cra_blocksize = 1, 1374 .cra_blocksize = 1,
1318 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1375 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1319 AESNI_ALIGN - 1, 1376 AESNI_ALIGN - 1,
@@ -1357,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { {
1357 .cra_name = "__gcm-aes-aesni", 1414 .cra_name = "__gcm-aes-aesni",
1358 .cra_driver_name = "__driver-gcm-aes-aesni", 1415 .cra_driver_name = "__driver-gcm-aes-aesni",
1359 .cra_priority = 0, 1416 .cra_priority = 0,
1360 .cra_flags = CRYPTO_ALG_TYPE_AEAD, 1417 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
1361 .cra_blocksize = 1, 1418 .cra_blocksize = 1,
1362 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + 1419 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) +
1363 AESNI_ALIGN, 1420 AESNI_ALIGN,
@@ -1366,8 +1423,12 @@ static struct crypto_alg aesni_algs[] = { {
1366 .cra_module = THIS_MODULE, 1423 .cra_module = THIS_MODULE,
1367 .cra_u = { 1424 .cra_u = {
1368 .aead = { 1425 .aead = {
1369 .encrypt = __driver_rfc4106_encrypt, 1426 .setkey = common_rfc4106_set_key,
1370 .decrypt = __driver_rfc4106_decrypt, 1427 .setauthsize = common_rfc4106_set_authsize,
1428 .encrypt = helper_rfc4106_encrypt,
1429 .decrypt = helper_rfc4106_decrypt,
1430 .ivsize = 8,
1431 .maxauthsize = 16,
1371 }, 1432 },
1372 }, 1433 },
1373}, { 1434}, {
@@ -1423,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { {
1423 .cra_name = "__lrw-aes-aesni", 1484 .cra_name = "__lrw-aes-aesni",
1424 .cra_driver_name = "__driver-lrw-aes-aesni", 1485 .cra_driver_name = "__driver-lrw-aes-aesni",
1425 .cra_priority = 0, 1486 .cra_priority = 0,
1426 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1487 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1488 CRYPTO_ALG_INTERNAL,
1427 .cra_blocksize = AES_BLOCK_SIZE, 1489 .cra_blocksize = AES_BLOCK_SIZE,
1428 .cra_ctxsize = sizeof(struct aesni_lrw_ctx), 1490 .cra_ctxsize = sizeof(struct aesni_lrw_ctx),
1429 .cra_alignmask = 0, 1491 .cra_alignmask = 0,
@@ -1444,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { {
1444 .cra_name = "__xts-aes-aesni", 1506 .cra_name = "__xts-aes-aesni",
1445 .cra_driver_name = "__driver-xts-aes-aesni", 1507 .cra_driver_name = "__driver-xts-aes-aesni",
1446 .cra_priority = 0, 1508 .cra_priority = 0,
1447 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1509 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1510 CRYPTO_ALG_INTERNAL,
1448 .cra_blocksize = AES_BLOCK_SIZE, 1511 .cra_blocksize = AES_BLOCK_SIZE,
1449 .cra_ctxsize = sizeof(struct aesni_xts_ctx), 1512 .cra_ctxsize = sizeof(struct aesni_xts_ctx),
1450 .cra_alignmask = 0, 1513 .cra_alignmask = 0,
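
The common thread in this and the following crypto glue files is the new CRYPTO_ALG_INTERNAL flag: the raw SIMD "__driver-..." algorithms are marked internal so generic users cannot instantiate them directly (they must not run without kernel_fpu_begin(), as the WARN_ONCE in the helper paths above notes), and the async wrappers reach them by passing the flag as both type and mask when allocating the cryptd-backed instance. A short sketch of that allocation pattern, mirroring the rfc4106_init() change above (illustrative; the wrapper function name is made up):

	#include <linux/err.h>
	#include <linux/crypto.h>
	#include <crypto/cryptd.h>

	static struct cryptd_aead *alloc_wrapped_gcm(void)
	{
		/*
		 * Asking for CRYPTO_ALG_INTERNAL in both type and mask lets the
		 * lookup return algorithms that are hidden from ordinary requests.
		 */
		return cryptd_alloc_aead("__driver-gcm-aes-aesni",
					 CRYPTO_ALG_INTERNAL,
					 CRYPTO_ALG_INTERNAL);
	}
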
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { {
343 .cra_name = "__ecb-camellia-aesni-avx2", 343 .cra_name = "__ecb-camellia-aesni-avx2",
344 .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", 344 .cra_driver_name = "__driver-ecb-camellia-aesni-avx2",
345 .cra_priority = 0, 345 .cra_priority = 0,
346 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 346 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
347 CRYPTO_ALG_INTERNAL,
347 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 348 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
348 .cra_ctxsize = sizeof(struct camellia_ctx), 349 .cra_ctxsize = sizeof(struct camellia_ctx),
349 .cra_alignmask = 0, 350 .cra_alignmask = 0,
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { {
362 .cra_name = "__cbc-camellia-aesni-avx2", 363 .cra_name = "__cbc-camellia-aesni-avx2",
363 .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", 364 .cra_driver_name = "__driver-cbc-camellia-aesni-avx2",
364 .cra_priority = 0, 365 .cra_priority = 0,
365 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 366 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
367 CRYPTO_ALG_INTERNAL,
366 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 368 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
367 .cra_ctxsize = sizeof(struct camellia_ctx), 369 .cra_ctxsize = sizeof(struct camellia_ctx),
368 .cra_alignmask = 0, 370 .cra_alignmask = 0,
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { {
381 .cra_name = "__ctr-camellia-aesni-avx2", 383 .cra_name = "__ctr-camellia-aesni-avx2",
382 .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", 384 .cra_driver_name = "__driver-ctr-camellia-aesni-avx2",
383 .cra_priority = 0, 385 .cra_priority = 0,
384 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 386 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
387 CRYPTO_ALG_INTERNAL,
385 .cra_blocksize = 1, 388 .cra_blocksize = 1,
386 .cra_ctxsize = sizeof(struct camellia_ctx), 389 .cra_ctxsize = sizeof(struct camellia_ctx),
387 .cra_alignmask = 0, 390 .cra_alignmask = 0,
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { {
401 .cra_name = "__lrw-camellia-aesni-avx2", 404 .cra_name = "__lrw-camellia-aesni-avx2",
402 .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", 405 .cra_driver_name = "__driver-lrw-camellia-aesni-avx2",
403 .cra_priority = 0, 406 .cra_priority = 0,
404 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 407 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
408 CRYPTO_ALG_INTERNAL,
405 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 409 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
406 .cra_ctxsize = sizeof(struct camellia_lrw_ctx), 410 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
407 .cra_alignmask = 0, 411 .cra_alignmask = 0,
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { {
424 .cra_name = "__xts-camellia-aesni-avx2", 428 .cra_name = "__xts-camellia-aesni-avx2",
425 .cra_driver_name = "__driver-xts-camellia-aesni-avx2", 429 .cra_driver_name = "__driver-xts-camellia-aesni-avx2",
426 .cra_priority = 0, 430 .cra_priority = 0,
427 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 431 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
432 CRYPTO_ALG_INTERNAL,
428 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 433 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
429 .cra_ctxsize = sizeof(struct camellia_xts_ctx), 434 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
430 .cra_alignmask = 0, 435 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { {
335 .cra_name = "__ecb-camellia-aesni", 335 .cra_name = "__ecb-camellia-aesni",
336 .cra_driver_name = "__driver-ecb-camellia-aesni", 336 .cra_driver_name = "__driver-ecb-camellia-aesni",
337 .cra_priority = 0, 337 .cra_priority = 0,
338 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 338 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
339 CRYPTO_ALG_INTERNAL,
339 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 340 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
340 .cra_ctxsize = sizeof(struct camellia_ctx), 341 .cra_ctxsize = sizeof(struct camellia_ctx),
341 .cra_alignmask = 0, 342 .cra_alignmask = 0,
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { {
354 .cra_name = "__cbc-camellia-aesni", 355 .cra_name = "__cbc-camellia-aesni",
355 .cra_driver_name = "__driver-cbc-camellia-aesni", 356 .cra_driver_name = "__driver-cbc-camellia-aesni",
356 .cra_priority = 0, 357 .cra_priority = 0,
357 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 358 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
359 CRYPTO_ALG_INTERNAL,
358 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 360 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
359 .cra_ctxsize = sizeof(struct camellia_ctx), 361 .cra_ctxsize = sizeof(struct camellia_ctx),
360 .cra_alignmask = 0, 362 .cra_alignmask = 0,
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { {
373 .cra_name = "__ctr-camellia-aesni", 375 .cra_name = "__ctr-camellia-aesni",
374 .cra_driver_name = "__driver-ctr-camellia-aesni", 376 .cra_driver_name = "__driver-ctr-camellia-aesni",
375 .cra_priority = 0, 377 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 378 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
379 CRYPTO_ALG_INTERNAL,
377 .cra_blocksize = 1, 380 .cra_blocksize = 1,
378 .cra_ctxsize = sizeof(struct camellia_ctx), 381 .cra_ctxsize = sizeof(struct camellia_ctx),
379 .cra_alignmask = 0, 382 .cra_alignmask = 0,
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { {
393 .cra_name = "__lrw-camellia-aesni", 396 .cra_name = "__lrw-camellia-aesni",
394 .cra_driver_name = "__driver-lrw-camellia-aesni", 397 .cra_driver_name = "__driver-lrw-camellia-aesni",
395 .cra_priority = 0, 398 .cra_priority = 0,
396 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
400 CRYPTO_ALG_INTERNAL,
397 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 401 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
398 .cra_ctxsize = sizeof(struct camellia_lrw_ctx), 402 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
399 .cra_alignmask = 0, 403 .cra_alignmask = 0,
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { {
416 .cra_name = "__xts-camellia-aesni", 420 .cra_name = "__xts-camellia-aesni",
417 .cra_driver_name = "__driver-xts-camellia-aesni", 421 .cra_driver_name = "__driver-xts-camellia-aesni",
418 .cra_priority = 0, 422 .cra_priority = 0,
419 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 423 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
424 CRYPTO_ALG_INTERNAL,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 425 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct camellia_xts_ctx), 426 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
422 .cra_alignmask = 0, 427 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { {
341 .cra_name = "__ecb-cast5-avx", 341 .cra_name = "__ecb-cast5-avx",
342 .cra_driver_name = "__driver-ecb-cast5-avx", 342 .cra_driver_name = "__driver-ecb-cast5-avx",
343 .cra_priority = 0, 343 .cra_priority = 0,
344 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 344 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
345 CRYPTO_ALG_INTERNAL,
345 .cra_blocksize = CAST5_BLOCK_SIZE, 346 .cra_blocksize = CAST5_BLOCK_SIZE,
346 .cra_ctxsize = sizeof(struct cast5_ctx), 347 .cra_ctxsize = sizeof(struct cast5_ctx),
347 .cra_alignmask = 0, 348 .cra_alignmask = 0,
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { {
360 .cra_name = "__cbc-cast5-avx", 361 .cra_name = "__cbc-cast5-avx",
361 .cra_driver_name = "__driver-cbc-cast5-avx", 362 .cra_driver_name = "__driver-cbc-cast5-avx",
362 .cra_priority = 0, 363 .cra_priority = 0,
363 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 364 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
365 CRYPTO_ALG_INTERNAL,
364 .cra_blocksize = CAST5_BLOCK_SIZE, 366 .cra_blocksize = CAST5_BLOCK_SIZE,
365 .cra_ctxsize = sizeof(struct cast5_ctx), 367 .cra_ctxsize = sizeof(struct cast5_ctx),
366 .cra_alignmask = 0, 368 .cra_alignmask = 0,
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { {
379 .cra_name = "__ctr-cast5-avx", 381 .cra_name = "__ctr-cast5-avx",
380 .cra_driver_name = "__driver-ctr-cast5-avx", 382 .cra_driver_name = "__driver-ctr-cast5-avx",
381 .cra_priority = 0, 383 .cra_priority = 0,
382 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 384 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
385 CRYPTO_ALG_INTERNAL,
383 .cra_blocksize = 1, 386 .cra_blocksize = 1,
384 .cra_ctxsize = sizeof(struct cast5_ctx), 387 .cra_ctxsize = sizeof(struct cast5_ctx),
385 .cra_alignmask = 0, 388 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { {
372 .cra_name = "__ecb-cast6-avx", 372 .cra_name = "__ecb-cast6-avx",
373 .cra_driver_name = "__driver-ecb-cast6-avx", 373 .cra_driver_name = "__driver-ecb-cast6-avx",
374 .cra_priority = 0, 374 .cra_priority = 0,
375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
376 CRYPTO_ALG_INTERNAL,
376 .cra_blocksize = CAST6_BLOCK_SIZE, 377 .cra_blocksize = CAST6_BLOCK_SIZE,
377 .cra_ctxsize = sizeof(struct cast6_ctx), 378 .cra_ctxsize = sizeof(struct cast6_ctx),
378 .cra_alignmask = 0, 379 .cra_alignmask = 0,
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { {
391 .cra_name = "__cbc-cast6-avx", 392 .cra_name = "__cbc-cast6-avx",
392 .cra_driver_name = "__driver-cbc-cast6-avx", 393 .cra_driver_name = "__driver-cbc-cast6-avx",
393 .cra_priority = 0, 394 .cra_priority = 0,
394 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 395 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
396 CRYPTO_ALG_INTERNAL,
395 .cra_blocksize = CAST6_BLOCK_SIZE, 397 .cra_blocksize = CAST6_BLOCK_SIZE,
396 .cra_ctxsize = sizeof(struct cast6_ctx), 398 .cra_ctxsize = sizeof(struct cast6_ctx),
397 .cra_alignmask = 0, 399 .cra_alignmask = 0,
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { {
410 .cra_name = "__ctr-cast6-avx", 412 .cra_name = "__ctr-cast6-avx",
411 .cra_driver_name = "__driver-ctr-cast6-avx", 413 .cra_driver_name = "__driver-ctr-cast6-avx",
412 .cra_priority = 0, 414 .cra_priority = 0,
413 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 415 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
416 CRYPTO_ALG_INTERNAL,
414 .cra_blocksize = 1, 417 .cra_blocksize = 1,
415 .cra_ctxsize = sizeof(struct cast6_ctx), 418 .cra_ctxsize = sizeof(struct cast6_ctx),
416 .cra_alignmask = 0, 419 .cra_alignmask = 0,
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { {
430 .cra_name = "__lrw-cast6-avx", 433 .cra_name = "__lrw-cast6-avx",
431 .cra_driver_name = "__driver-lrw-cast6-avx", 434 .cra_driver_name = "__driver-lrw-cast6-avx",
432 .cra_priority = 0, 435 .cra_priority = 0,
433 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 436 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
437 CRYPTO_ALG_INTERNAL,
434 .cra_blocksize = CAST6_BLOCK_SIZE, 438 .cra_blocksize = CAST6_BLOCK_SIZE,
435 .cra_ctxsize = sizeof(struct cast6_lrw_ctx), 439 .cra_ctxsize = sizeof(struct cast6_lrw_ctx),
436 .cra_alignmask = 0, 440 .cra_alignmask = 0,
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { {
453 .cra_name = "__xts-cast6-avx", 457 .cra_name = "__xts-cast6-avx",
454 .cra_driver_name = "__driver-xts-cast6-avx", 458 .cra_driver_name = "__driver-xts-cast6-avx",
455 .cra_priority = 0, 459 .cra_priority = 0,
456 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 460 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
461 CRYPTO_ALG_INTERNAL,
457 .cra_blocksize = CAST6_BLOCK_SIZE, 462 .cra_blocksize = CAST6_BLOCK_SIZE,
458 .cra_ctxsize = sizeof(struct cast6_xts_ctx), 463 .cra_ctxsize = sizeof(struct cast6_xts_ctx),
459 .cra_alignmask = 0, 464 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
178 ## 2a) PROCESS FULL BLOCKS: 178 ## 2a) PROCESS FULL BLOCKS:
179 ################################################################ 179 ################################################################
180full_block: 180full_block:
181 movq $128,%rax 181 movl $128,%eax
182 lea 128*8*2(block_0), block_1 182 lea 128*8*2(block_0), block_1
183 lea 128*8*3(block_0), block_2 183 lea 128*8*3(block_0), block_2
184 add $128*8*1, block_0 184 add $128*8*1, block_0
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
154 .cra_name = "__ghash", 154 .cra_name = "__ghash",
155 .cra_driver_name = "__ghash-pclmulqdqni", 155 .cra_driver_name = "__ghash-pclmulqdqni",
156 .cra_priority = 0, 156 .cra_priority = 0,
157 .cra_flags = CRYPTO_ALG_TYPE_SHASH, 157 .cra_flags = CRYPTO_ALG_TYPE_SHASH |
158 CRYPTO_ALG_INTERNAL,
158 .cra_blocksize = GHASH_BLOCK_SIZE, 159 .cra_blocksize = GHASH_BLOCK_SIZE,
159 .cra_ctxsize = sizeof(struct ghash_ctx), 160 .cra_ctxsize = sizeof(struct ghash_ctx),
160 .cra_module = THIS_MODULE, 161 .cra_module = THIS_MODULE,
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm)
261 struct cryptd_ahash *cryptd_tfm; 262 struct cryptd_ahash *cryptd_tfm;
262 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); 263 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
263 264
264 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); 265 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
266 CRYPTO_ALG_INTERNAL,
267 CRYPTO_ALG_INTERNAL);
265 if (IS_ERR(cryptd_tfm)) 268 if (IS_ERR(cryptd_tfm))
266 return PTR_ERR(cryptd_tfm); 269 return PTR_ERR(cryptd_tfm);
267 ctx->cryptd_tfm = cryptd_tfm; 270 ctx->cryptd_tfm = cryptd_tfm;
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
232 232
233 le128_to_be128((be128 *)walk->iv, &ctrblk); 233 le128_to_be128((be128 *)walk->iv, &ctrblk);
234} 234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236 235
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, 236static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc, 237 struct blkcipher_desc *desc,
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { {
309 .cra_name = "__ecb-serpent-avx2", 309 .cra_name = "__ecb-serpent-avx2",
310 .cra_driver_name = "__driver-ecb-serpent-avx2", 310 .cra_driver_name = "__driver-ecb-serpent-avx2",
311 .cra_priority = 0, 311 .cra_priority = 0,
312 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 312 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
313 CRYPTO_ALG_INTERNAL,
313 .cra_blocksize = SERPENT_BLOCK_SIZE, 314 .cra_blocksize = SERPENT_BLOCK_SIZE,
314 .cra_ctxsize = sizeof(struct serpent_ctx), 315 .cra_ctxsize = sizeof(struct serpent_ctx),
315 .cra_alignmask = 0, 316 .cra_alignmask = 0,
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { {
329 .cra_name = "__cbc-serpent-avx2", 330 .cra_name = "__cbc-serpent-avx2",
330 .cra_driver_name = "__driver-cbc-serpent-avx2", 331 .cra_driver_name = "__driver-cbc-serpent-avx2",
331 .cra_priority = 0, 332 .cra_priority = 0,
332 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 333 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
334 CRYPTO_ALG_INTERNAL,
333 .cra_blocksize = SERPENT_BLOCK_SIZE, 335 .cra_blocksize = SERPENT_BLOCK_SIZE,
334 .cra_ctxsize = sizeof(struct serpent_ctx), 336 .cra_ctxsize = sizeof(struct serpent_ctx),
335 .cra_alignmask = 0, 337 .cra_alignmask = 0,
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { {
349 .cra_name = "__ctr-serpent-avx2", 351 .cra_name = "__ctr-serpent-avx2",
350 .cra_driver_name = "__driver-ctr-serpent-avx2", 352 .cra_driver_name = "__driver-ctr-serpent-avx2",
351 .cra_priority = 0, 353 .cra_priority = 0,
352 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 354 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
355 CRYPTO_ALG_INTERNAL,
353 .cra_blocksize = 1, 356 .cra_blocksize = 1,
354 .cra_ctxsize = sizeof(struct serpent_ctx), 357 .cra_ctxsize = sizeof(struct serpent_ctx),
355 .cra_alignmask = 0, 358 .cra_alignmask = 0,
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { {
370 .cra_name = "__lrw-serpent-avx2", 373 .cra_name = "__lrw-serpent-avx2",
371 .cra_driver_name = "__driver-lrw-serpent-avx2", 374 .cra_driver_name = "__driver-lrw-serpent-avx2",
372 .cra_priority = 0, 375 .cra_priority = 0,
373 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
377 CRYPTO_ALG_INTERNAL,
374 .cra_blocksize = SERPENT_BLOCK_SIZE, 378 .cra_blocksize = SERPENT_BLOCK_SIZE,
375 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 379 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
376 .cra_alignmask = 0, 380 .cra_alignmask = 0,
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { {
394 .cra_name = "__xts-serpent-avx2", 398 .cra_name = "__xts-serpent-avx2",
395 .cra_driver_name = "__driver-xts-serpent-avx2", 399 .cra_driver_name = "__driver-xts-serpent-avx2",
396 .cra_priority = 0, 400 .cra_priority = 0,
397 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
402 CRYPTO_ALG_INTERNAL,
398 .cra_blocksize = SERPENT_BLOCK_SIZE, 403 .cra_blocksize = SERPENT_BLOCK_SIZE,
399 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 404 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
400 .cra_alignmask = 0, 405 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { {
378 .cra_name = "__ecb-serpent-avx", 378 .cra_name = "__ecb-serpent-avx",
379 .cra_driver_name = "__driver-ecb-serpent-avx", 379 .cra_driver_name = "__driver-ecb-serpent-avx",
380 .cra_priority = 0, 380 .cra_priority = 0,
381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
382 CRYPTO_ALG_INTERNAL,
382 .cra_blocksize = SERPENT_BLOCK_SIZE, 383 .cra_blocksize = SERPENT_BLOCK_SIZE,
383 .cra_ctxsize = sizeof(struct serpent_ctx), 384 .cra_ctxsize = sizeof(struct serpent_ctx),
384 .cra_alignmask = 0, 385 .cra_alignmask = 0,
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { {
397 .cra_name = "__cbc-serpent-avx", 398 .cra_name = "__cbc-serpent-avx",
398 .cra_driver_name = "__driver-cbc-serpent-avx", 399 .cra_driver_name = "__driver-cbc-serpent-avx",
399 .cra_priority = 0, 400 .cra_priority = 0,
400 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
402 CRYPTO_ALG_INTERNAL,
401 .cra_blocksize = SERPENT_BLOCK_SIZE, 403 .cra_blocksize = SERPENT_BLOCK_SIZE,
402 .cra_ctxsize = sizeof(struct serpent_ctx), 404 .cra_ctxsize = sizeof(struct serpent_ctx),
403 .cra_alignmask = 0, 405 .cra_alignmask = 0,
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { {
416 .cra_name = "__ctr-serpent-avx", 418 .cra_name = "__ctr-serpent-avx",
417 .cra_driver_name = "__driver-ctr-serpent-avx", 419 .cra_driver_name = "__driver-ctr-serpent-avx",
418 .cra_priority = 0, 420 .cra_priority = 0,
419 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 421 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
422 CRYPTO_ALG_INTERNAL,
420 .cra_blocksize = 1, 423 .cra_blocksize = 1,
421 .cra_ctxsize = sizeof(struct serpent_ctx), 424 .cra_ctxsize = sizeof(struct serpent_ctx),
422 .cra_alignmask = 0, 425 .cra_alignmask = 0,
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { {
436 .cra_name = "__lrw-serpent-avx", 439 .cra_name = "__lrw-serpent-avx",
437 .cra_driver_name = "__driver-lrw-serpent-avx", 440 .cra_driver_name = "__driver-lrw-serpent-avx",
438 .cra_priority = 0, 441 .cra_priority = 0,
439 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 442 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
443 CRYPTO_ALG_INTERNAL,
440 .cra_blocksize = SERPENT_BLOCK_SIZE, 444 .cra_blocksize = SERPENT_BLOCK_SIZE,
441 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 445 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
442 .cra_alignmask = 0, 446 .cra_alignmask = 0,
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { {
459 .cra_name = "__xts-serpent-avx", 463 .cra_name = "__xts-serpent-avx",
460 .cra_driver_name = "__driver-xts-serpent-avx", 464 .cra_driver_name = "__driver-xts-serpent-avx",
461 .cra_priority = 0, 465 .cra_priority = 0,
462 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 466 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
467 CRYPTO_ALG_INTERNAL,
463 .cra_blocksize = SERPENT_BLOCK_SIZE, 468 .cra_blocksize = SERPENT_BLOCK_SIZE,
464 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 469 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
465 .cra_alignmask = 0, 470 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index bf025adaea01..3643dd508f45 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { {
387 .cra_name = "__ecb-serpent-sse2", 387 .cra_name = "__ecb-serpent-sse2",
388 .cra_driver_name = "__driver-ecb-serpent-sse2", 388 .cra_driver_name = "__driver-ecb-serpent-sse2",
389 .cra_priority = 0, 389 .cra_priority = 0,
390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
391 CRYPTO_ALG_INTERNAL,
391 .cra_blocksize = SERPENT_BLOCK_SIZE, 392 .cra_blocksize = SERPENT_BLOCK_SIZE,
392 .cra_ctxsize = sizeof(struct serpent_ctx), 393 .cra_ctxsize = sizeof(struct serpent_ctx),
393 .cra_alignmask = 0, 394 .cra_alignmask = 0,
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { {
406 .cra_name = "__cbc-serpent-sse2", 407 .cra_name = "__cbc-serpent-sse2",
407 .cra_driver_name = "__driver-cbc-serpent-sse2", 408 .cra_driver_name = "__driver-cbc-serpent-sse2",
408 .cra_priority = 0, 409 .cra_priority = 0,
409 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 410 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
411 CRYPTO_ALG_INTERNAL,
410 .cra_blocksize = SERPENT_BLOCK_SIZE, 412 .cra_blocksize = SERPENT_BLOCK_SIZE,
411 .cra_ctxsize = sizeof(struct serpent_ctx), 413 .cra_ctxsize = sizeof(struct serpent_ctx),
412 .cra_alignmask = 0, 414 .cra_alignmask = 0,
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { {
425 .cra_name = "__ctr-serpent-sse2", 427 .cra_name = "__ctr-serpent-sse2",
426 .cra_driver_name = "__driver-ctr-serpent-sse2", 428 .cra_driver_name = "__driver-ctr-serpent-sse2",
427 .cra_priority = 0, 429 .cra_priority = 0,
428 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 430 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
431 CRYPTO_ALG_INTERNAL,
429 .cra_blocksize = 1, 432 .cra_blocksize = 1,
430 .cra_ctxsize = sizeof(struct serpent_ctx), 433 .cra_ctxsize = sizeof(struct serpent_ctx),
431 .cra_alignmask = 0, 434 .cra_alignmask = 0,
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { {
445 .cra_name = "__lrw-serpent-sse2", 448 .cra_name = "__lrw-serpent-sse2",
446 .cra_driver_name = "__driver-lrw-serpent-sse2", 449 .cra_driver_name = "__driver-lrw-serpent-sse2",
447 .cra_priority = 0, 450 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 451 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
452 CRYPTO_ALG_INTERNAL,
449 .cra_blocksize = SERPENT_BLOCK_SIZE, 453 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 454 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0, 455 .cra_alignmask = 0,
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { {
468 .cra_name = "__xts-serpent-sse2", 472 .cra_name = "__xts-serpent-sse2",
469 .cra_driver_name = "__driver-xts-serpent-sse2", 473 .cra_driver_name = "__driver-xts-serpent-sse2",
470 .cra_priority = 0, 474 .cra_priority = 0,
471 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 475 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
476 CRYPTO_ALG_INTERNAL,
472 .cra_blocksize = SERPENT_BLOCK_SIZE, 477 .cra_blocksize = SERPENT_BLOCK_SIZE,
473 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 478 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
474 .cra_alignmask = 0, 479 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index fd9f6b035b16..e510b1c5d690 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = {
694 * use ASYNC flag as some buffers in multi-buffer 694 * use ASYNC flag as some buffers in multi-buffer
695 * algo may not have completed before hashing thread sleep 695 * algo may not have completed before hashing thread sleep
696 */ 696 */
697 .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, 697 .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
698 CRYPTO_ALG_INTERNAL,
698 .cra_blocksize = SHA1_BLOCK_SIZE, 699 .cra_blocksize = SHA1_BLOCK_SIZE,
699 .cra_module = THIS_MODULE, 700 .cra_module = THIS_MODULE,
700 .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), 701 .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
770 struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); 771 struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
771 struct mcryptd_hash_ctx *mctx; 772 struct mcryptd_hash_ctx *mctx;
772 773
773 mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); 774 mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
775 CRYPTO_ALG_INTERNAL,
776 CRYPTO_ALG_INTERNAL);
774 if (IS_ERR(mcryptd_tfm)) 777 if (IS_ERR(mcryptd_tfm))
775 return PTR_ERR(mcryptd_tfm); 778 return PTR_ERR(mcryptd_tfm);
776 mctx = crypto_ahash_ctx(&mcryptd_tfm->base); 779 mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
@@ -828,7 +831,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
828 while (!list_empty(&cstate->work_list)) { 831 while (!list_empty(&cstate->work_list)) {
829 rctx = list_entry(cstate->work_list.next, 832 rctx = list_entry(cstate->work_list.next,
830 struct mcryptd_hash_request_ctx, waiter); 833 struct mcryptd_hash_request_ctx, waiter);
831 if time_before(cur_time, rctx->tag.expire) 834 if (time_before(cur_time, rctx->tag.expire))
832 break; 835 break;
833 kernel_fpu_begin(); 836 kernel_fpu_begin();
834 sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); 837 sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
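
Two smaller fixes ride along in this file: the INTERNAL flag and the mcryptd allocation follow the same pattern as the other glue files, and the expiry check in sha1_mb_flusher() gains the parentheses it was always meant to have. The old `if time_before(...)` only compiled because the time_before() macro expands to a parenthesized expression, so behavior is unchanged; the new form simply matches normal kernel style. For reference, a reduced, hypothetical version of the corrected expiry scan (the real loop also flushes the SHA-1 context manager and completes the request):

	#include <linux/jiffies.h>
	#include <linux/list.h>
	#include <crypto/mcryptd.h>

	/* Walk the pending-work list oldest-first and stop at the first entry
	 * whose deadline has not passed yet; time_before() is wrap-safe for
	 * jiffies arithmetic. */
	static void example_flush_expired(struct list_head *work_list,
					  unsigned long cur_time)
	{
		struct mcryptd_hash_request_ctx *rctx;

		while (!list_empty(work_list)) {
			rctx = list_entry(work_list->next,
					  struct mcryptd_hash_request_ctx, waiter);
			if (time_before(cur_time, rctx->tag.expire))
				break;
			/* expired: the real code flushes the manager and
			 * completes rctx here; list_del() stands in for that */
			list_del(&rctx->waiter);
		}
	}
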
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
index 4ca7e166a2aa..822acb5b464c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -56,7 +56,7 @@
56void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) 56void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
57{ 57{
58 unsigned int j; 58 unsigned int j;
59 state->unused_lanes = 0xF76543210; 59 state->unused_lanes = 0xF76543210ULL;
60 for (j = 0; j < 8; j++) { 60 for (j = 0; j < 8; j++) {
61 state->lens[j] = 0xFFFFFFFF; 61 state->lens[j] = 0xFFFFFFFF;
62 state->ldata[j].job_in_lane = NULL; 62 state->ldata[j].job_in_lane = NULL;
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 6c20fe04a738..33d1b9dc14cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -28,7 +28,7 @@
28#include <linux/cryptohash.h> 28#include <linux/cryptohash.h>
29#include <linux/types.h> 29#include <linux/types.h>
30#include <crypto/sha.h> 30#include <crypto/sha.h>
31#include <asm/byteorder.h> 31#include <crypto/sha1_base.h>
32#include <asm/i387.h> 32#include <asm/i387.h>
33#include <asm/xcr.h> 33#include <asm/xcr.h>
34#include <asm/xsave.h> 34#include <asm/xsave.h>
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
44#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ 44#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
45 45
46asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, 46asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
47 unsigned int rounds); 47 unsigned int rounds);
48#endif 48#endif
49 49
50static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); 50static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
51
52
53static int sha1_ssse3_init(struct shash_desc *desc)
54{
55 struct sha1_state *sctx = shash_desc_ctx(desc);
56
57 *sctx = (struct sha1_state){
58 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
59 };
60
61 return 0;
62}
63
64static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
65 unsigned int len, unsigned int partial)
66{
67 struct sha1_state *sctx = shash_desc_ctx(desc);
68 unsigned int done = 0;
69
70 sctx->count += len;
71
72 if (partial) {
73 done = SHA1_BLOCK_SIZE - partial;
74 memcpy(sctx->buffer + partial, data, done);
75 sha1_transform_asm(sctx->state, sctx->buffer, 1);
76 }
77
78 if (len - done >= SHA1_BLOCK_SIZE) {
79 const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
80
81 sha1_transform_asm(sctx->state, data + done, rounds);
82 done += rounds * SHA1_BLOCK_SIZE;
83 }
84
85 memcpy(sctx->buffer, data + done, len - done);
86
87 return 0;
88}
89 51
90static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, 52static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
91 unsigned int len) 53 unsigned int len)
92{ 54{
93 struct sha1_state *sctx = shash_desc_ctx(desc); 55 struct sha1_state *sctx = shash_desc_ctx(desc);
94 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
95 int res;
96 56
97 /* Handle the fast case right here */ 57 if (!irq_fpu_usable() ||
98 if (partial + len < SHA1_BLOCK_SIZE) { 58 (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
99 sctx->count += len; 59 return crypto_sha1_update(desc, data, len);
100 memcpy(sctx->buffer + partial, data, len);
101 60
102 return 0; 61 /* make sure casting to sha1_block_fn() is safe */
103 } 62 BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
104 63
105 if (!irq_fpu_usable()) { 64 kernel_fpu_begin();
106 res = crypto_sha1_update(desc, data, len); 65 sha1_base_do_update(desc, data, len,
107 } else { 66 (sha1_block_fn *)sha1_transform_asm);
108 kernel_fpu_begin(); 67 kernel_fpu_end();
109 res = __sha1_ssse3_update(desc, data, len, partial);
110 kernel_fpu_end();
111 }
112
113 return res;
114}
115
116
117/* Add padding and return the message digest. */
118static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
119{
120 struct sha1_state *sctx = shash_desc_ctx(desc);
121 unsigned int i, index, padlen;
122 __be32 *dst = (__be32 *)out;
123 __be64 bits;
124 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
125
126 bits = cpu_to_be64(sctx->count << 3);
127
128 /* Pad out to 56 mod 64 and append length */
129 index = sctx->count % SHA1_BLOCK_SIZE;
130 padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
131 if (!irq_fpu_usable()) {
132 crypto_sha1_update(desc, padding, padlen);
133 crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
134 } else {
135 kernel_fpu_begin();
136 /* We need to fill a whole block for __sha1_ssse3_update() */
137 if (padlen <= 56) {
138 sctx->count += padlen;
139 memcpy(sctx->buffer + index, padding, padlen);
140 } else {
141 __sha1_ssse3_update(desc, padding, padlen, index);
142 }
143 __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
144 kernel_fpu_end();
145 }
146
147 /* Store state in digest */
148 for (i = 0; i < 5; i++)
149 dst[i] = cpu_to_be32(sctx->state[i]);
150
151 /* Wipe context */
152 memset(sctx, 0, sizeof(*sctx));
153 68
154 return 0; 69 return 0;
155} 70}
156 71
157static int sha1_ssse3_export(struct shash_desc *desc, void *out) 72static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
73 unsigned int len, u8 *out)
158{ 74{
159 struct sha1_state *sctx = shash_desc_ctx(desc); 75 if (!irq_fpu_usable())
76 return crypto_sha1_finup(desc, data, len, out);
160 77
161 memcpy(out, sctx, sizeof(*sctx)); 78 kernel_fpu_begin();
79 if (len)
80 sha1_base_do_update(desc, data, len,
81 (sha1_block_fn *)sha1_transform_asm);
82 sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
83 kernel_fpu_end();
162 84
163 return 0; 85 return sha1_base_finish(desc, out);
164} 86}
165 87
166static int sha1_ssse3_import(struct shash_desc *desc, const void *in) 88/* Add padding and return the message digest. */
89static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
167{ 90{
168 struct sha1_state *sctx = shash_desc_ctx(desc); 91 return sha1_ssse3_finup(desc, NULL, 0, out);
169
170 memcpy(sctx, in, sizeof(*sctx));
171
172 return 0;
173} 92}
174 93
175#ifdef CONFIG_AS_AVX2 94#ifdef CONFIG_AS_AVX2
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data,
186 105
187static struct shash_alg alg = { 106static struct shash_alg alg = {
188 .digestsize = SHA1_DIGEST_SIZE, 107 .digestsize = SHA1_DIGEST_SIZE,
189 .init = sha1_ssse3_init, 108 .init = sha1_base_init,
190 .update = sha1_ssse3_update, 109 .update = sha1_ssse3_update,
191 .final = sha1_ssse3_final, 110 .final = sha1_ssse3_final,
192 .export = sha1_ssse3_export, 111 .finup = sha1_ssse3_finup,
193 .import = sha1_ssse3_import,
194 .descsize = sizeof(struct sha1_state), 112 .descsize = sizeof(struct sha1_state),
195 .statesize = sizeof(struct sha1_state),
196 .base = { 113 .base = {
197 .cra_name = "sha1", 114 .cra_name = "sha1",
198 .cra_driver_name= "sha1-ssse3", 115 .cra_driver_name= "sha1-ssse3",
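
Summarizing the sha1_ssse3_glue.c rewrite: init/export/import and the hand-rolled buffering and padding are gone, replaced by the generic helpers from <crypto/sha1_base.h>; the arch code now only decides when the FPU may be used and brackets the block function with kernel_fpu_begin()/kernel_fpu_end(), falling back to the generic C implementation otherwise. Read together, the new update/finup pair from the hunks above is:

	#include <crypto/internal/hash.h>
	#include <crypto/sha.h>
	#include <crypto/sha1_base.h>
	#include <asm/i387.h>
	#include <linux/kernel.h>

	/* Function pointer selected at module init (ssse3/avx/avx2), as in the file. */
	static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);

	static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
				     unsigned int len)
	{
		struct sha1_state *sctx = shash_desc_ctx(desc);

		if (!irq_fpu_usable() ||
		    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
			return crypto_sha1_update(desc, data, len);

		/* casting to sha1_block_fn() is safe: state is the first member */
		BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);

		kernel_fpu_begin();
		sha1_base_do_update(desc, data, len,
				    (sha1_block_fn *)sha1_transform_asm);
		kernel_fpu_end();

		return 0;
	}

	static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
				    unsigned int len, u8 *out)
	{
		if (!irq_fpu_usable())
			return crypto_sha1_finup(desc, data, len, out);

		kernel_fpu_begin();
		if (len)
			sha1_base_do_update(desc, data, len,
					    (sha1_block_fn *)sha1_transform_asm);
		sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
		kernel_fpu_end();

		return sha1_base_finish(desc, out);
	}

The SHA-256 and SHA-384/512 glue files below are converted to exactly the same shape.
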
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 642f15687a0a..92b3b5d75ba9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
96BYTE_FLIP_MASK = %xmm13 96BYTE_FLIP_MASK = %xmm13
97 97
98NUM_BLKS = %rdx # 3rd arg 98NUM_BLKS = %rdx # 3rd arg
99CTX = %rsi # 2nd arg 99INP = %rsi # 2nd arg
100INP = %rdi # 1st arg 100CTX = %rdi # 1st arg
101 101
102SRND = %rdi # clobbers INP 102SRND = %rsi # clobbers INP
103c = %ecx 103c = %ecx
104d = %r8d 104d = %r8d
105e = %edx 105e = %edx
@@ -342,8 +342,8 @@ a = TMP_
342 342
343######################################################################## 343########################################################################
344## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) 344## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
345## arg 1 : pointer to input data 345## arg 1 : pointer to digest
346## arg 2 : pointer to digest 346## arg 2 : pointer to input data
347## arg 3 : Num blocks 347## arg 3 : Num blocks
348######################################################################## 348########################################################################
349.text 349.text
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9e86944c539d..570ec5ec62d7 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13
91X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK 91X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
92 92
93NUM_BLKS = %rdx # 3rd arg 93NUM_BLKS = %rdx # 3rd arg
94CTX = %rsi # 2nd arg 94INP = %rsi # 2nd arg
95INP = %rdi # 1st arg 95CTX = %rdi # 1st arg
96c = %ecx 96c = %ecx
97d = %r8d 97d = %r8d
98e = %edx # clobbers NUM_BLKS 98e = %edx # clobbers NUM_BLKS
99y3 = %edi # clobbers INP 99y3 = %esi # clobbers INP
100 100
101 101
102TBL = %rbp 102TBL = %rbp
@@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE
523 523
524######################################################################## 524########################################################################
525## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) 525## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
526## arg 1 : pointer to input data 526## arg 1 : pointer to digest
527## arg 2 : pointer to digest 527## arg 2 : pointer to input data
528## arg 3 : Num blocks 528## arg 3 : Num blocks
529######################################################################## 529########################################################################
530.text 530.text
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index f833b74d902b..2cedc44e8121 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
88BYTE_FLIP_MASK = %xmm12 88BYTE_FLIP_MASK = %xmm12
89 89
90NUM_BLKS = %rdx # 3rd arg 90NUM_BLKS = %rdx # 3rd arg
91CTX = %rsi # 2nd arg 91INP = %rsi # 2nd arg
92INP = %rdi # 1st arg 92CTX = %rdi # 1st arg
93 93
94SRND = %rdi # clobbers INP 94SRND = %rsi # clobbers INP
95c = %ecx 95c = %ecx
96d = %r8d 96d = %r8d
97e = %edx 97e = %edx
@@ -348,8 +348,8 @@ a = TMP_
348 348
349######################################################################## 349########################################################################
350## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) 350## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
351## arg 1 : pointer to input data 351## arg 1 : pointer to digest
352## arg 2 : pointer to digest 352## arg 2 : pointer to input data
353## arg 3 : Num blocks 353## arg 3 : Num blocks
354######################################################################## 354########################################################################
355.text 355.text
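
The three sha256 assembly hunks above all make the same change: the digest pointer moves to the first argument and the input pointer to the second, so the transforms take (digest, data, nblocks) in the state-first order expected by the generic block functions in <crypto/sha256_base.h>. That is what lets the rewritten glue code in the next file cast the asm entry points to sha256_block_fn and drop its own buffering. In prototype form (these are the declarations the glue file switches to):

	#include <linux/linkage.h>
	#include <linux/types.h>

	/* Before: asmlinkage void sha256_transform_ssse3(const char *data,
	 *                                                u32 *digest, u64 rounds);
	 * After, state first: */
	asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
					       u64 rounds);
	asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
					     u64 rounds);
	asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
					      u64 rounds);
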
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 8fad72f4dfd2..ccc338881ee8 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -36,195 +36,74 @@
36#include <linux/cryptohash.h> 36#include <linux/cryptohash.h>
37#include <linux/types.h> 37#include <linux/types.h>
38#include <crypto/sha.h> 38#include <crypto/sha.h>
39#include <asm/byteorder.h> 39#include <crypto/sha256_base.h>
40#include <asm/i387.h> 40#include <asm/i387.h>
41#include <asm/xcr.h> 41#include <asm/xcr.h>
42#include <asm/xsave.h> 42#include <asm/xsave.h>
43#include <linux/string.h> 43#include <linux/string.h>
44 44
45asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, 45asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
46 u64 rounds); 46 u64 rounds);
47#ifdef CONFIG_AS_AVX 47#ifdef CONFIG_AS_AVX
48asmlinkage void sha256_transform_avx(const char *data, u32 *digest, 48asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
49 u64 rounds); 49 u64 rounds);
50#endif 50#endif
51#ifdef CONFIG_AS_AVX2 51#ifdef CONFIG_AS_AVX2
52asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, 52asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
53 u64 rounds); 53 u64 rounds);
54#endif 54#endif
55 55
56static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); 56static void (*sha256_transform_asm)(u32 *, const char *, u64);
57
58
59static int sha256_ssse3_init(struct shash_desc *desc)
60{
61 struct sha256_state *sctx = shash_desc_ctx(desc);
62
63 sctx->state[0] = SHA256_H0;
64 sctx->state[1] = SHA256_H1;
65 sctx->state[2] = SHA256_H2;
66 sctx->state[3] = SHA256_H3;
67 sctx->state[4] = SHA256_H4;
68 sctx->state[5] = SHA256_H5;
69 sctx->state[6] = SHA256_H6;
70 sctx->state[7] = SHA256_H7;
71 sctx->count = 0;
72
73 return 0;
74}
75
76static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
77 unsigned int len, unsigned int partial)
78{
79 struct sha256_state *sctx = shash_desc_ctx(desc);
80 unsigned int done = 0;
81
82 sctx->count += len;
83
84 if (partial) {
85 done = SHA256_BLOCK_SIZE - partial;
86 memcpy(sctx->buf + partial, data, done);
87 sha256_transform_asm(sctx->buf, sctx->state, 1);
88 }
89
90 if (len - done >= SHA256_BLOCK_SIZE) {
91 const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
92
93 sha256_transform_asm(data + done, sctx->state, (u64) rounds);
94
95 done += rounds * SHA256_BLOCK_SIZE;
96 }
97
98 memcpy(sctx->buf, data + done, len - done);
99
100 return 0;
101}
102 57
103static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, 58static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
104 unsigned int len) 59 unsigned int len)
105{ 60{
106 struct sha256_state *sctx = shash_desc_ctx(desc); 61 struct sha256_state *sctx = shash_desc_ctx(desc);
107 unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
108 int res;
109 62
110 /* Handle the fast case right here */ 63 if (!irq_fpu_usable() ||
111 if (partial + len < SHA256_BLOCK_SIZE) { 64 (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
112 sctx->count += len; 65 return crypto_sha256_update(desc, data, len);
113 memcpy(sctx->buf + partial, data, len);
114 66
115 return 0; 67 /* make sure casting to sha256_block_fn() is safe */
116 } 68 BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
117
118 if (!irq_fpu_usable()) {
119 res = crypto_sha256_update(desc, data, len);
120 } else {
121 kernel_fpu_begin();
122 res = __sha256_ssse3_update(desc, data, len, partial);
123 kernel_fpu_end();
124 }
125
126 return res;
127}
128 69
129 70 kernel_fpu_begin();
130/* Add padding and return the message digest. */ 71 sha256_base_do_update(desc, data, len,
131static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) 72 (sha256_block_fn *)sha256_transform_asm);
132{ 73 kernel_fpu_end();
133 struct sha256_state *sctx = shash_desc_ctx(desc);
134 unsigned int i, index, padlen;
135 __be32 *dst = (__be32 *)out;
136 __be64 bits;
137 static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
138
139 bits = cpu_to_be64(sctx->count << 3);
140
141 /* Pad out to 56 mod 64 and append length */
142 index = sctx->count % SHA256_BLOCK_SIZE;
143 padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
144
145 if (!irq_fpu_usable()) {
146 crypto_sha256_update(desc, padding, padlen);
147 crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
148 } else {
149 kernel_fpu_begin();
150 /* We need to fill a whole block for __sha256_ssse3_update() */
151 if (padlen <= 56) {
152 sctx->count += padlen;
153 memcpy(sctx->buf + index, padding, padlen);
154 } else {
155 __sha256_ssse3_update(desc, padding, padlen, index);
156 }
157 __sha256_ssse3_update(desc, (const u8 *)&bits,
158 sizeof(bits), 56);
159 kernel_fpu_end();
160 }
161
162 /* Store state in digest */
163 for (i = 0; i < 8; i++)
164 dst[i] = cpu_to_be32(sctx->state[i]);
165
166 /* Wipe context */
167 memset(sctx, 0, sizeof(*sctx));
168 74
169 return 0; 75 return 0;
170} 76}
171 77
172static int sha256_ssse3_export(struct shash_desc *desc, void *out) 78static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
79 unsigned int len, u8 *out)
173{ 80{
174 struct sha256_state *sctx = shash_desc_ctx(desc); 81 if (!irq_fpu_usable())
82 return crypto_sha256_finup(desc, data, len, out);
175 83
176 memcpy(out, sctx, sizeof(*sctx)); 84 kernel_fpu_begin();
85 if (len)
86 sha256_base_do_update(desc, data, len,
87 (sha256_block_fn *)sha256_transform_asm);
88 sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
89 kernel_fpu_end();
177 90
178 return 0; 91 return sha256_base_finish(desc, out);
179} 92}
180 93
181static int sha256_ssse3_import(struct shash_desc *desc, const void *in) 94/* Add padding and return the message digest. */
182{ 95static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
183 struct sha256_state *sctx = shash_desc_ctx(desc);
184
185 memcpy(sctx, in, sizeof(*sctx));
186
187 return 0;
188}
189
190static int sha224_ssse3_init(struct shash_desc *desc)
191{
192 struct sha256_state *sctx = shash_desc_ctx(desc);
193
194 sctx->state[0] = SHA224_H0;
195 sctx->state[1] = SHA224_H1;
196 sctx->state[2] = SHA224_H2;
197 sctx->state[3] = SHA224_H3;
198 sctx->state[4] = SHA224_H4;
199 sctx->state[5] = SHA224_H5;
200 sctx->state[6] = SHA224_H6;
201 sctx->state[7] = SHA224_H7;
202 sctx->count = 0;
203
204 return 0;
205}
206
207static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
208{ 96{
209 u8 D[SHA256_DIGEST_SIZE]; 97 return sha256_ssse3_finup(desc, NULL, 0, out);
210
211 sha256_ssse3_final(desc, D);
212
213 memcpy(hash, D, SHA224_DIGEST_SIZE);
214 memzero_explicit(D, SHA256_DIGEST_SIZE);
215
216 return 0;
217} 98}
218 99
219static struct shash_alg algs[] = { { 100static struct shash_alg algs[] = { {
220 .digestsize = SHA256_DIGEST_SIZE, 101 .digestsize = SHA256_DIGEST_SIZE,
221 .init = sha256_ssse3_init, 102 .init = sha256_base_init,
222 .update = sha256_ssse3_update, 103 .update = sha256_ssse3_update,
223 .final = sha256_ssse3_final, 104 .final = sha256_ssse3_final,
224 .export = sha256_ssse3_export, 105 .finup = sha256_ssse3_finup,
225 .import = sha256_ssse3_import,
226 .descsize = sizeof(struct sha256_state), 106 .descsize = sizeof(struct sha256_state),
227 .statesize = sizeof(struct sha256_state),
228 .base = { 107 .base = {
229 .cra_name = "sha256", 108 .cra_name = "sha256",
230 .cra_driver_name = "sha256-ssse3", 109 .cra_driver_name = "sha256-ssse3",
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { {
235 } 114 }
236}, { 115}, {
237 .digestsize = SHA224_DIGEST_SIZE, 116 .digestsize = SHA224_DIGEST_SIZE,
238 .init = sha224_ssse3_init, 117 .init = sha224_base_init,
239 .update = sha256_ssse3_update, 118 .update = sha256_ssse3_update,
240 .final = sha224_ssse3_final, 119 .final = sha256_ssse3_final,
241 .export = sha256_ssse3_export, 120 .finup = sha256_ssse3_finup,
242 .import = sha256_ssse3_import,
243 .descsize = sizeof(struct sha256_state), 121 .descsize = sizeof(struct sha256_state),
244 .statesize = sizeof(struct sha256_state),
245 .base = { 122 .base = {
246 .cra_name = "sha224", 123 .cra_name = "sha224",
247 .cra_driver_name = "sha224-ssse3", 124 .cra_driver_name = "sha224-ssse3",
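
After the conversion, sha224 and sha256 differ only in .init and .digestsize; update/final/finup and descsize are shared, and the export/import/statesize entries disappear together with the hand-rolled helpers they pointed at. A condensed excerpt of the resulting registration (fields the patch leaves untouched are trimmed; the update/final/finup functions are the ones defined in the hunks above):

	#include <crypto/internal/hash.h>
	#include <crypto/sha.h>
	#include <crypto/sha256_base.h>

	static struct shash_alg algs[] = { {
		.digestsize	= SHA256_DIGEST_SIZE,
		.init		= sha256_base_init,
		.update		= sha256_ssse3_update,
		.final		= sha256_ssse3_final,
		.finup		= sha256_ssse3_finup,
		.descsize	= sizeof(struct sha256_state),
		.base		= {
			.cra_name	 = "sha256",
			.cra_driver_name = "sha256-ssse3",
			/* priority, flags, blocksize, module unchanged */
		}
	}, {
		.digestsize	= SHA224_DIGEST_SIZE,
		.init		= sha224_base_init,
		.update		= sha256_ssse3_update,
		.final		= sha256_ssse3_final,
		.finup		= sha256_ssse3_finup,
		.descsize	= sizeof(struct sha256_state),
		.base		= {
			.cra_name	 = "sha224",
			.cra_driver_name = "sha224-ssse3",
			/* priority, flags, blocksize, module unchanged */
		}
	} };
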
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 974dde9bc6cd..565274d6a641 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -54,9 +54,9 @@
54 54
55# Virtual Registers 55# Virtual Registers
56# ARG1 56# ARG1
57msg = %rdi 57digest = %rdi
58# ARG2 58# ARG2
59digest = %rsi 59msg = %rsi
60# ARG3 60# ARG3
61msglen = %rdx 61msglen = %rdx
62T1 = %rcx 62T1 = %rcx
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
271.endm 271.endm
272 272
273######################################################################## 273########################################################################
274# void sha512_transform_avx(const void* M, void* D, u64 L) 274# void sha512_transform_avx(void* D, const void* M, u64 L)
275# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 275# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
276# The size of the message pointed to by M must be an integer multiple of SHA512 276# The size of the message pointed to by M must be an integer multiple of SHA512
277# message blocks. 277# message blocks.
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 568b96105f5c..a4771dcd1fcf 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -70,9 +70,9 @@ XFER = YTMP0
70BYTE_FLIP_MASK = %ymm9 70BYTE_FLIP_MASK = %ymm9
71 71
72# 1st arg 72# 1st arg
73INP = %rdi 73CTX = %rdi
74# 2nd arg 74# 2nd arg
75CTX = %rsi 75INP = %rsi
76# 3rd arg 76# 3rd arg
77NUM_BLKS = %rdx 77NUM_BLKS = %rdx
78 78
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
562.endm 562.endm
563 563
564######################################################################## 564########################################################################
565# void sha512_transform_rorx(const void* M, void* D, uint64_t L)# 565# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
566# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 566# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
567# The size of the message pointed to by M must be an integer multiple of SHA512 567# The size of the message pointed to by M must be an integer multiple of SHA512
568# message blocks. 568# message blocks.
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index fb56855d51f5..e610e29cbc81 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -53,9 +53,9 @@
53 53
54# Virtual Registers 54# Virtual Registers
55# ARG1 55# ARG1
56msg = %rdi 56digest = %rdi
57# ARG2 57# ARG2
58digest = %rsi 58msg = %rsi
59# ARG3 59# ARG3
60msglen = %rdx 60msglen = %rdx
61T1 = %rcx 61T1 = %rcx
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
269.endm 269.endm
270 270
271######################################################################## 271########################################################################
272# void sha512_transform_ssse3(const void* M, void* D, u64 L)# 272# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
273# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 273# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
274# The size of the message pointed to by M must be an integer multiple of SHA512 274# The size of the message pointed to by M must be an integer multiple of SHA512
275# message blocks. 275# message blocks.
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b6af26832bf..d9fa4c1e063f 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,205 +34,75 @@
34#include <linux/cryptohash.h> 34#include <linux/cryptohash.h>
35#include <linux/types.h> 35#include <linux/types.h>
36#include <crypto/sha.h> 36#include <crypto/sha.h>
37#include <asm/byteorder.h> 37#include <crypto/sha512_base.h>
38#include <asm/i387.h> 38#include <asm/i387.h>
39#include <asm/xcr.h> 39#include <asm/xcr.h>
40#include <asm/xsave.h> 40#include <asm/xsave.h>
41 41
42#include <linux/string.h> 42#include <linux/string.h>
43 43
44asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, 44asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
45 u64 rounds); 45 u64 rounds);
46#ifdef CONFIG_AS_AVX 46#ifdef CONFIG_AS_AVX
47asmlinkage void sha512_transform_avx(const char *data, u64 *digest, 47asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
48 u64 rounds); 48 u64 rounds);
49#endif 49#endif
50#ifdef CONFIG_AS_AVX2 50#ifdef CONFIG_AS_AVX2
51asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, 51asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
52 u64 rounds); 52 u64 rounds);
53#endif 53#endif
54 54
55static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); 55static void (*sha512_transform_asm)(u64 *, const char *, u64);
56
57
58static int sha512_ssse3_init(struct shash_desc *desc)
59{
60 struct sha512_state *sctx = shash_desc_ctx(desc);
61
62 sctx->state[0] = SHA512_H0;
63 sctx->state[1] = SHA512_H1;
64 sctx->state[2] = SHA512_H2;
65 sctx->state[3] = SHA512_H3;
66 sctx->state[4] = SHA512_H4;
67 sctx->state[5] = SHA512_H5;
68 sctx->state[6] = SHA512_H6;
69 sctx->state[7] = SHA512_H7;
70 sctx->count[0] = sctx->count[1] = 0;
71
72 return 0;
73}
74 56
75static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 57static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
76 unsigned int len, unsigned int partial) 58 unsigned int len)
77{ 59{
78 struct sha512_state *sctx = shash_desc_ctx(desc); 60 struct sha512_state *sctx = shash_desc_ctx(desc);
79 unsigned int done = 0;
80
81 sctx->count[0] += len;
82 if (sctx->count[0] < len)
83 sctx->count[1]++;
84 61
85 if (partial) { 62 if (!irq_fpu_usable() ||
86 done = SHA512_BLOCK_SIZE - partial; 63 (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
87 memcpy(sctx->buf + partial, data, done); 64 return crypto_sha512_update(desc, data, len);
88 sha512_transform_asm(sctx->buf, sctx->state, 1);
89 }
90
91 if (len - done >= SHA512_BLOCK_SIZE) {
92 const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
93 65
94 sha512_transform_asm(data + done, sctx->state, (u64) rounds); 66 /* make sure casting to sha512_block_fn() is safe */
95 67 BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
96 done += rounds * SHA512_BLOCK_SIZE;
97 }
98 68
99 memcpy(sctx->buf, data + done, len - done); 69 kernel_fpu_begin();
70 sha512_base_do_update(desc, data, len,
71 (sha512_block_fn *)sha512_transform_asm);
72 kernel_fpu_end();
100 73
101 return 0; 74 return 0;
102} 75}
103 76
104static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 77static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
105 unsigned int len) 78 unsigned int len, u8 *out)
106{ 79{
107 struct sha512_state *sctx = shash_desc_ctx(desc); 80 if (!irq_fpu_usable())
108 unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; 81 return crypto_sha512_finup(desc, data, len, out);
109 int res;
110
111 /* Handle the fast case right here */
112 if (partial + len < SHA512_BLOCK_SIZE) {
113 sctx->count[0] += len;
114 if (sctx->count[0] < len)
115 sctx->count[1]++;
116 memcpy(sctx->buf + partial, data, len);
117
118 return 0;
119 }
120 82
121 if (!irq_fpu_usable()) { 83 kernel_fpu_begin();
122 res = crypto_sha512_update(desc, data, len); 84 if (len)
123 } else { 85 sha512_base_do_update(desc, data, len,
124 kernel_fpu_begin(); 86 (sha512_block_fn *)sha512_transform_asm);
125 res = __sha512_ssse3_update(desc, data, len, partial); 87 sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
126 kernel_fpu_end(); 88 kernel_fpu_end();
127 }
128 89
129 return res; 90 return sha512_base_finish(desc, out);
130} 91}
131 92
132
133/* Add padding and return the message digest. */ 93/* Add padding and return the message digest. */
134static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) 94static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
135{ 95{
136 struct sha512_state *sctx = shash_desc_ctx(desc); 96 return sha512_ssse3_finup(desc, NULL, 0, out);
137 unsigned int i, index, padlen;
138 __be64 *dst = (__be64 *)out;
139 __be64 bits[2];
140 static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
141
142 /* save number of bits */
143 bits[1] = cpu_to_be64(sctx->count[0] << 3);
144 bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
145
146 /* Pad out to 112 mod 128 and append length */
147 index = sctx->count[0] & 0x7f;
148 padlen = (index < 112) ? (112 - index) : ((128+112) - index);
149
150 if (!irq_fpu_usable()) {
151 crypto_sha512_update(desc, padding, padlen);
152 crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
153 } else {
154 kernel_fpu_begin();
155 /* We need to fill a whole block for __sha512_ssse3_update() */
156 if (padlen <= 112) {
157 sctx->count[0] += padlen;
158 if (sctx->count[0] < padlen)
159 sctx->count[1]++;
160 memcpy(sctx->buf + index, padding, padlen);
161 } else {
162 __sha512_ssse3_update(desc, padding, padlen, index);
163 }
164 __sha512_ssse3_update(desc, (const u8 *)&bits,
165 sizeof(bits), 112);
166 kernel_fpu_end();
167 }
168
169 /* Store state in digest */
170 for (i = 0; i < 8; i++)
171 dst[i] = cpu_to_be64(sctx->state[i]);
172
173 /* Wipe context */
174 memset(sctx, 0, sizeof(*sctx));
175
176 return 0;
177}
178
179static int sha512_ssse3_export(struct shash_desc *desc, void *out)
180{
181 struct sha512_state *sctx = shash_desc_ctx(desc);
182
183 memcpy(out, sctx, sizeof(*sctx));
184
185 return 0;
186}
187
188static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
189{
190 struct sha512_state *sctx = shash_desc_ctx(desc);
191
192 memcpy(sctx, in, sizeof(*sctx));
193
194 return 0;
195}
196
197static int sha384_ssse3_init(struct shash_desc *desc)
198{
199 struct sha512_state *sctx = shash_desc_ctx(desc);
200
201 sctx->state[0] = SHA384_H0;
202 sctx->state[1] = SHA384_H1;
203 sctx->state[2] = SHA384_H2;
204 sctx->state[3] = SHA384_H3;
205 sctx->state[4] = SHA384_H4;
206 sctx->state[5] = SHA384_H5;
207 sctx->state[6] = SHA384_H6;
208 sctx->state[7] = SHA384_H7;
209
210 sctx->count[0] = sctx->count[1] = 0;
211
212 return 0;
213}
214
215static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
216{
217 u8 D[SHA512_DIGEST_SIZE];
218
219 sha512_ssse3_final(desc, D);
220
221 memcpy(hash, D, SHA384_DIGEST_SIZE);
222 memzero_explicit(D, SHA512_DIGEST_SIZE);
223
224 return 0;
225} 97}
226 98
227static struct shash_alg algs[] = { { 99static struct shash_alg algs[] = { {
228 .digestsize = SHA512_DIGEST_SIZE, 100 .digestsize = SHA512_DIGEST_SIZE,
229 .init = sha512_ssse3_init, 101 .init = sha512_base_init,
230 .update = sha512_ssse3_update, 102 .update = sha512_ssse3_update,
231 .final = sha512_ssse3_final, 103 .final = sha512_ssse3_final,
232 .export = sha512_ssse3_export, 104 .finup = sha512_ssse3_finup,
233 .import = sha512_ssse3_import,
234 .descsize = sizeof(struct sha512_state), 105 .descsize = sizeof(struct sha512_state),
235 .statesize = sizeof(struct sha512_state),
236 .base = { 106 .base = {
237 .cra_name = "sha512", 107 .cra_name = "sha512",
238 .cra_driver_name = "sha512-ssse3", 108 .cra_driver_name = "sha512-ssse3",
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { {
243 } 113 }
244}, { 114}, {
245 .digestsize = SHA384_DIGEST_SIZE, 115 .digestsize = SHA384_DIGEST_SIZE,
246 .init = sha384_ssse3_init, 116 .init = sha384_base_init,
247 .update = sha512_ssse3_update, 117 .update = sha512_ssse3_update,
248 .final = sha384_ssse3_final, 118 .final = sha512_ssse3_final,
249 .export = sha512_ssse3_export, 119 .finup = sha512_ssse3_finup,
250 .import = sha512_ssse3_import,
251 .descsize = sizeof(struct sha512_state), 120 .descsize = sizeof(struct sha512_state),
252 .statesize = sizeof(struct sha512_state),
253 .base = { 121 .base = {
254 .cra_name = "sha384", 122 .cra_name = "sha384",
255 .cra_driver_name = "sha384-ssse3", 123 .cra_driver_name = "sha384-ssse3",
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
264 movq R1, 8(%rsi) 264 movq R1, 8(%rsi)
265 265
266 popq R1 266 popq R1
267 movq $1,%rax 267 movl $1,%eax
268 ret 268 ret
269ENDPROC(twofish_enc_blk) 269ENDPROC(twofish_enc_blk)
270 270
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
316 movq R1, 8(%rsi) 316 movq R1, 8(%rsi)
317 317
318 popq R1 318 popq R1
319 movq $1,%rax 319 movl $1,%eax
320 ret 320 ret
321ENDPROC(twofish_dec_blk) 321ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 1ac531ea9bcc..b5e2d5651851 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { {
340 .cra_name = "__ecb-twofish-avx", 340 .cra_name = "__ecb-twofish-avx",
341 .cra_driver_name = "__driver-ecb-twofish-avx", 341 .cra_driver_name = "__driver-ecb-twofish-avx",
342 .cra_priority = 0, 342 .cra_priority = 0,
343 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 343 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
344 CRYPTO_ALG_INTERNAL,
344 .cra_blocksize = TF_BLOCK_SIZE, 345 .cra_blocksize = TF_BLOCK_SIZE,
345 .cra_ctxsize = sizeof(struct twofish_ctx), 346 .cra_ctxsize = sizeof(struct twofish_ctx),
346 .cra_alignmask = 0, 347 .cra_alignmask = 0,
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { {
359 .cra_name = "__cbc-twofish-avx", 360 .cra_name = "__cbc-twofish-avx",
360 .cra_driver_name = "__driver-cbc-twofish-avx", 361 .cra_driver_name = "__driver-cbc-twofish-avx",
361 .cra_priority = 0, 362 .cra_priority = 0,
362 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 363 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
364 CRYPTO_ALG_INTERNAL,
363 .cra_blocksize = TF_BLOCK_SIZE, 365 .cra_blocksize = TF_BLOCK_SIZE,
364 .cra_ctxsize = sizeof(struct twofish_ctx), 366 .cra_ctxsize = sizeof(struct twofish_ctx),
365 .cra_alignmask = 0, 367 .cra_alignmask = 0,
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { {
378 .cra_name = "__ctr-twofish-avx", 380 .cra_name = "__ctr-twofish-avx",
379 .cra_driver_name = "__driver-ctr-twofish-avx", 381 .cra_driver_name = "__driver-ctr-twofish-avx",
380 .cra_priority = 0, 382 .cra_priority = 0,
381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 383 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
384 CRYPTO_ALG_INTERNAL,
382 .cra_blocksize = 1, 385 .cra_blocksize = 1,
383 .cra_ctxsize = sizeof(struct twofish_ctx), 386 .cra_ctxsize = sizeof(struct twofish_ctx),
384 .cra_alignmask = 0, 387 .cra_alignmask = 0,
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { {
398 .cra_name = "__lrw-twofish-avx", 401 .cra_name = "__lrw-twofish-avx",
399 .cra_driver_name = "__driver-lrw-twofish-avx", 402 .cra_driver_name = "__driver-lrw-twofish-avx",
400 .cra_priority = 0, 403 .cra_priority = 0,
401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 404 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
405 CRYPTO_ALG_INTERNAL,
402 .cra_blocksize = TF_BLOCK_SIZE, 406 .cra_blocksize = TF_BLOCK_SIZE,
403 .cra_ctxsize = sizeof(struct twofish_lrw_ctx), 407 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
404 .cra_alignmask = 0, 408 .cra_alignmask = 0,
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { {
421 .cra_name = "__xts-twofish-avx", 425 .cra_name = "__xts-twofish-avx",
422 .cra_driver_name = "__driver-xts-twofish-avx", 426 .cra_driver_name = "__driver-xts-twofish-avx",
423 .cra_priority = 0, 427 .cra_priority = 0,
424 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 428 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
429 CRYPTO_ALG_INTERNAL,
425 .cra_blocksize = TF_BLOCK_SIZE, 430 .cra_blocksize = TF_BLOCK_SIZE,
426 .cra_ctxsize = sizeof(struct twofish_xts_ctx), 431 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
427 .cra_alignmask = 0, 432 .cra_alignmask = 0,
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
7 6
8obj-$(CONFIG_IA32_AOUT) += ia32_aout.o 7obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
9 8
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
161} 161}
162 162
163static int ia32_restore_sigcontext(struct pt_regs *regs, 163static int ia32_restore_sigcontext(struct pt_regs *regs,
164 struct sigcontext_ia32 __user *sc, 164 struct sigcontext_ia32 __user *sc)
165 unsigned int *pax)
166{ 165{
167 unsigned int tmpflags, err = 0; 166 unsigned int tmpflags, err = 0;
168 void __user *buf; 167 void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
184 RELOAD_SEG(es); 183 RELOAD_SEG(es);
185 184
186 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 185 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
187 COPY(dx); COPY(cx); COPY(ip); 186 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
188 /* Don't touch extended registers */ 187 /* Don't touch extended registers */
189 188
190 COPY_SEG_CPL3(cs); 189 COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
197 196
198 get_user_ex(tmp, &sc->fpstate); 197 get_user_ex(tmp, &sc->fpstate);
199 buf = compat_ptr(tmp); 198 buf = compat_ptr(tmp);
200
201 get_user_ex(*pax, &sc->ax);
202 } get_user_catch(err); 199 } get_user_catch(err);
203 200
204 err |= restore_xstate_sig(buf, 1); 201 err |= restore_xstate_sig(buf, 1);
205 202
203 force_iret();
204
206 return err; 205 return err;
207} 206}
208 207
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
211 struct pt_regs *regs = current_pt_regs(); 210 struct pt_regs *regs = current_pt_regs();
212 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); 211 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
213 sigset_t set; 212 sigset_t set;
214 unsigned int ax;
215 213
216 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 214 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
217 goto badframe; 215 goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
224 222
225 set_current_blocked(&set); 223 set_current_blocked(&set);
226 224
227 if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) 225 if (ia32_restore_sigcontext(regs, &frame->sc))
228 goto badframe; 226 goto badframe;
229 return ax; 227 return regs->ax;
230 228
231badframe: 229badframe:
232 signal_fault(regs, frame, "32bit sigreturn"); 230 signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
238 struct pt_regs *regs = current_pt_regs(); 236 struct pt_regs *regs = current_pt_regs();
239 struct rt_sigframe_ia32 __user *frame; 237 struct rt_sigframe_ia32 __user *frame;
240 sigset_t set; 238 sigset_t set;
241 unsigned int ax;
242 239
243 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); 240 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
244 241
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
249 246
250 set_current_blocked(&set); 247 set_current_blocked(&set);
251 248
252 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 249 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
253 goto badframe; 250 goto badframe;
254 251
255 if (compat_restore_altstack(&frame->uc.uc_stack)) 252 if (compat_restore_altstack(&frame->uc.uc_stack))
256 goto badframe; 253 goto badframe;
257 254
258 return ax; 255 return regs->ax;
259 256
260badframe: 257badframe:
261 signal_fault(regs, frame, "32bit rt sigreturn"); 258 signal_fault(regs, frame, "32bit rt sigreturn");
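
The sigreturn simplification above works because ia32_restore_sigcontext() now restores the saved eax through the same COPY() path as the other registers (the COPY(ax) added earlier in the hunk), so the separate *pax out-parameter and the local ax variables disappear and both return paths simply return regs->ax. The added force_iret() makes the eventual return to user space go through the full register restore, which is needed once sigcontext can change registers that the fast return paths do not reload. For orientation, the COPY() helper used by these hunks is essentially the following (a rough sketch of the existing macro in this file, not something added by the patch):

	/* Copy one saved register from the 32-bit sigcontext back into pt_regs,
	 * inside the surrounding get_user_try/get_user_catch block. */
	#define COPY(x)			do {			\
		get_user_ex(regs->x, &sc->x);			\
	} while (0)
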
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
30 30
31 .section .entry.text, "ax" 31 .section .entry.text, "ax"
32 32
33 .macro IA32_ARG_FIXUP noebp=0 33 /* clobbers %rax */
34 movl %edi,%r8d 34 .macro CLEAR_RREGS _r9=rax
35 .if \noebp
36 .else
37 movl %ebp,%r9d
38 .endif
39 xchg %ecx,%esi
40 movl %ebx,%edi
41 movl %edx,%edx /* zero extension */
42 .endm
43
44 /* clobbers %eax */
45 .macro CLEAR_RREGS offset=0, _r9=rax
46 xorl %eax,%eax 35 xorl %eax,%eax
47 movq %rax,\offset+R11(%rsp) 36 movq %rax,R11(%rsp)
48 movq %rax,\offset+R10(%rsp) 37 movq %rax,R10(%rsp)
49 movq %\_r9,\offset+R9(%rsp) 38 movq %\_r9,R9(%rsp)
50 movq %rax,\offset+R8(%rsp) 39 movq %rax,R8(%rsp)
51 .endm 40 .endm
52 41
53 /* 42 /*
@@ -60,14 +49,14 @@
60 * If it's -1 to make us punt the syscall, then (u32)-1 is still 49 * If it's -1 to make us punt the syscall, then (u32)-1 is still
61 * an appropriately invalid value. 50 * an appropriately invalid value.
62 */ 51 */
63 .macro LOAD_ARGS32 offset, _r9=0 52 .macro LOAD_ARGS32 _r9=0
64 .if \_r9 53 .if \_r9
65 movl \offset+16(%rsp),%r9d 54 movl R9(%rsp),%r9d
66 .endif 55 .endif
67 movl \offset+40(%rsp),%ecx 56 movl RCX(%rsp),%ecx
68 movl \offset+48(%rsp),%edx 57 movl RDX(%rsp),%edx
69 movl \offset+56(%rsp),%esi 58 movl RSI(%rsp),%esi
70 movl \offset+64(%rsp),%edi 59 movl RDI(%rsp),%edi
71 movl %eax,%eax /* zero extension */ 60 movl %eax,%eax /* zero extension */
72 .endm 61 .endm
73 62
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
99/* 88/*
100 * 32bit SYSENTER instruction entry. 89 * 32bit SYSENTER instruction entry.
101 * 90 *
91 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
92 * IF and VM in rflags are cleared (IOW: interrupts are off).
93 * SYSENTER does not save anything on the stack,
94 * and does not save old rip (!!!) and rflags.
95 *
102 * Arguments: 96 * Arguments:
103 * %eax System call number. 97 * eax system call number
104 * %ebx Arg1 98 * ebx arg1
105 * %ecx Arg2 99 * ecx arg2
106 * %edx Arg3 100 * edx arg3
107 * %esi Arg4 101 * esi arg4
108 * %edi Arg5 102 * edi arg5
109 * %ebp user stack 103 * ebp user stack
110 * 0(%ebp) Arg6 104 * 0(%ebp) arg6
111 * 105 *
112 * Interrupts off.
113 *
114 * This is purely a fast path. For anything complicated we use the int 0x80 106 * This is purely a fast path. For anything complicated we use the int 0x80
115 * path below. Set up a complete hardware stack frame to share code 107 * path below. We set up a complete hardware stack frame to share code
116 * with the int 0x80 path. 108 * with the int 0x80 path.
117 */ 109 */
118ENTRY(ia32_sysenter_target) 110ENTRY(ia32_sysenter_target)
119 CFI_STARTPROC32 simple 111 CFI_STARTPROC32 simple
120 CFI_SIGNAL_FRAME 112 CFI_SIGNAL_FRAME
121 CFI_DEF_CFA rsp,0 113 CFI_DEF_CFA rsp,0
122 CFI_REGISTER rsp,rbp 114 CFI_REGISTER rsp,rbp
123 SWAPGS_UNSAFE_STACK 115
124 movq PER_CPU_VAR(kernel_stack), %rsp
125 addq $(KERNEL_STACK_OFFSET),%rsp
126 /* 116 /*
127 * No need to follow this irqs on/off section: the syscall 117 * Interrupts are off on entry.
128 * disabled irqs, here we enable it straight after entry: 118 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
119 * it is too small to ever cause noticeable irq latency.
129 */ 120 */
121 SWAPGS_UNSAFE_STACK
122 movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
130 ENABLE_INTERRUPTS(CLBR_NONE) 123 ENABLE_INTERRUPTS(CLBR_NONE)
131 movl %ebp,%ebp /* zero extension */ 124
132 pushq_cfi $__USER32_DS 125 /* Zero-extending 32-bit regs, do not remove */
133 /*CFI_REL_OFFSET ss,0*/ 126 movl %ebp, %ebp
134 pushq_cfi %rbp
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi
137 /*CFI_REL_OFFSET rflags,0*/
138 movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
139 CFI_REGISTER rip,r10
140 pushq_cfi $__USER32_CS
141 /*CFI_REL_OFFSET cs,0*/
142 movl %eax, %eax 127 movl %eax, %eax
143 pushq_cfi %r10 128
144 CFI_REL_OFFSET rip,0 129 movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
145 pushq_cfi %rax 130 CFI_REGISTER rip,r10
131
132 /* Construct struct pt_regs on stack */
133 pushq_cfi $__USER32_DS /* pt_regs->ss */
134 pushq_cfi %rbp /* pt_regs->sp */
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi /* pt_regs->flags */
137 pushq_cfi $__USER32_CS /* pt_regs->cs */
138 pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */
139 CFI_REL_OFFSET rip,0
140 pushq_cfi_reg rax /* pt_regs->orig_ax */
141 pushq_cfi_reg rdi /* pt_regs->di */
142 pushq_cfi_reg rsi /* pt_regs->si */
143 pushq_cfi_reg rdx /* pt_regs->dx */
144 pushq_cfi_reg rcx /* pt_regs->cx */
145 pushq_cfi_reg rax /* pt_regs->ax */
146 cld 146 cld
147 SAVE_ARGS 0,1,0 147 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
148 /* no need to do an access_ok check here because rbp has been 148 CFI_ADJUST_CFA_OFFSET 10*8
149 32bit zero extended */ 149
150 /*
151 * no need to do an access_ok check here because rbp has been
152 * 32bit zero extended
153 */
150 ASM_STAC 154 ASM_STAC
1511: movl (%rbp),%ebp 1551: movl (%rbp),%ebp
152 _ASM_EXTABLE(1b,ia32_badarg) 156 _ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
157 * ourselves. To save a few cycles, we can check whether 161 * ourselves. To save a few cycles, we can check whether
158 * NT was set instead of doing an unconditional popfq. 162 * NT was set instead of doing an unconditional popfq.
159 */ 163 */
160 testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) 164 testl $X86_EFLAGS_NT,EFLAGS(%rsp)
161 jnz sysenter_fix_flags 165 jnz sysenter_fix_flags
162sysenter_flags_fixed: 166sysenter_flags_fixed:
163 167
164 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 168 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
165 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 169 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
166 CFI_REMEMBER_STATE 170 CFI_REMEMBER_STATE
167 jnz sysenter_tracesys 171 jnz sysenter_tracesys
168 cmpq $(IA32_NR_syscalls-1),%rax 172 cmpq $(IA32_NR_syscalls-1),%rax
169 ja ia32_badsys 173 ja ia32_badsys
170sysenter_do_call: 174sysenter_do_call:
171 IA32_ARG_FIXUP 175 /* 32bit syscall -> 64bit C ABI argument conversion */
176 movl %edi,%r8d /* arg5 */
177 movl %ebp,%r9d /* arg6 */
178 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
179 movl %ebx,%edi /* arg1 */
180 movl %edx,%edx /* arg3 (zero extension) */
172sysenter_dispatch: 181sysenter_dispatch:
173 call *ia32_sys_call_table(,%rax,8) 182 call *ia32_sys_call_table(,%rax,8)
174 movq %rax,RAX-ARGOFFSET(%rsp) 183 movq %rax,RAX(%rsp)
175 DISABLE_INTERRUPTS(CLBR_NONE) 184 DISABLE_INTERRUPTS(CLBR_NONE)
176 TRACE_IRQS_OFF 185 TRACE_IRQS_OFF
177 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 186 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
178 jnz sysexit_audit 187 jnz sysexit_audit
179sysexit_from_sys_call: 188sysexit_from_sys_call:
180 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 189 /*
181 /* clear IF, that popfq doesn't enable interrupts early */ 190 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
182 andl $~0x200,EFLAGS-ARGOFFSET(%rsp) 191 * NMI between STI and SYSEXIT has poorly specified behavior,
183 movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ 192 * and an NMI followed by an IRQ with usergs is fatal. So
184 CFI_REGISTER rip,rdx 193 * we just pretend we're using SYSEXIT but we really use
185 RESTORE_ARGS 0,24,0,0,0,0 194 * SYSRETL instead.
195 *
196 * This code path is still called 'sysexit' because it pairs
197 * with 'sysenter' and it uses the SYSENTER calling convention.
198 */
199 andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
200 movl RIP(%rsp),%ecx /* User %eip */
201 CFI_REGISTER rip,rcx
202 RESTORE_RSI_RDI
203 xorl %edx,%edx /* avoid info leaks */
186 xorq %r8,%r8 204 xorq %r8,%r8
187 xorq %r9,%r9 205 xorq %r9,%r9
188 xorq %r10,%r10 206 xorq %r10,%r10
189 xorq %r11,%r11 207 movl EFLAGS(%rsp),%r11d /* User eflags */
190 popfq_cfi
191 /*CFI_RESTORE rflags*/ 208 /*CFI_RESTORE rflags*/
192 popq_cfi %rcx /* User %esp */
193 CFI_REGISTER rsp,rcx
194 TRACE_IRQS_ON 209 TRACE_IRQS_ON
195 ENABLE_INTERRUPTS_SYSEXIT32 210
211 /*
212 * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
213 * since it avoids a dicey window with interrupts enabled.
214 */
215 movl RSP(%rsp),%esp
216
217 /*
218 * USERGS_SYSRET32 does:
219 * gsbase = user's gs base
220 * eip = ecx
221 * rflags = r11
222 * cs = __USER32_CS
223 * ss = __USER_DS
224 *
225 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
226 *
227 * pop %ebp
228 * pop %edx
229 * pop %ecx
230 *
231 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
232 * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
233 * address (already known to user code), and R12-R15 are
234 * callee-saved and therefore don't contain any interesting
235 * kernel data.
236 */
237 USERGS_SYSRET32
196 238
197 CFI_RESTORE_STATE 239 CFI_RESTORE_STATE
198 240
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
205 movl %ebx,%esi /* 2nd arg: 1st syscall arg */ 247 movl %ebx,%esi /* 2nd arg: 1st syscall arg */
206 movl %eax,%edi /* 1st arg: syscall number */ 248 movl %eax,%edi /* 1st arg: syscall number */
207 call __audit_syscall_entry 249 call __audit_syscall_entry
208 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ 250 movl RAX(%rsp),%eax /* reload syscall number */
209 cmpq $(IA32_NR_syscalls-1),%rax 251 cmpq $(IA32_NR_syscalls-1),%rax
210 ja ia32_badsys 252 ja ia32_badsys
211 movl %ebx,%edi /* reload 1st syscall arg */ 253 movl %ebx,%edi /* reload 1st syscall arg */
212 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ 254 movl RCX(%rsp),%esi /* reload 2nd syscall arg */
213 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ 255 movl RDX(%rsp),%edx /* reload 3rd syscall arg */
214 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ 256 movl RSI(%rsp),%ecx /* reload 4th syscall arg */
215 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ 257 movl RDI(%rsp),%r8d /* reload 5th syscall arg */
216 .endm 258 .endm
217 259
218 .macro auditsys_exit exit 260 .macro auditsys_exit exit
219 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 261 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
220 jnz ia32_ret_from_sys_call 262 jnz ia32_ret_from_sys_call
221 TRACE_IRQS_ON 263 TRACE_IRQS_ON
222 ENABLE_INTERRUPTS(CLBR_NONE) 264 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
2271: setbe %al /* 1 if error, 0 if not */ 2691: setbe %al /* 1 if error, 0 if not */
228 movzbl %al,%edi /* zero-extend that into %edi */ 270 movzbl %al,%edi /* zero-extend that into %edi */
229 call __audit_syscall_exit 271 call __audit_syscall_exit
230 movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ 272 movq RAX(%rsp),%rax /* reload syscall return value */
231 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 273 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
232 DISABLE_INTERRUPTS(CLBR_NONE) 274 DISABLE_INTERRUPTS(CLBR_NONE)
233 TRACE_IRQS_OFF 275 TRACE_IRQS_OFF
234 testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 276 testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
235 jz \exit 277 jz \exit
236 CLEAR_RREGS -ARGOFFSET 278 CLEAR_RREGS
237 jmp int_with_check 279 jmp int_with_check
238 .endm 280 .endm
239 281
@@ -253,16 +295,16 @@ sysenter_fix_flags:
253 295
254sysenter_tracesys: 296sysenter_tracesys:
255#ifdef CONFIG_AUDITSYSCALL 297#ifdef CONFIG_AUDITSYSCALL
256 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 298 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
257 jz sysenter_auditsys 299 jz sysenter_auditsys
258#endif 300#endif
259 SAVE_REST 301 SAVE_EXTRA_REGS
260 CLEAR_RREGS 302 CLEAR_RREGS
261 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 303 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
262 movq %rsp,%rdi /* &pt_regs -> arg1 */ 304 movq %rsp,%rdi /* &pt_regs -> arg1 */
263 call syscall_trace_enter 305 call syscall_trace_enter
264 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 306 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
265 RESTORE_REST 307 RESTORE_EXTRA_REGS
266 cmpq $(IA32_NR_syscalls-1),%rax 308 cmpq $(IA32_NR_syscalls-1),%rax
267 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 309 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
268 jmp sysenter_do_call 310 jmp sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
272/* 314/*
273 * 32bit SYSCALL instruction entry. 315 * 32bit SYSCALL instruction entry.
274 * 316 *
317 * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
318 * then loads new ss, cs, and rip from previously programmed MSRs.
319 * rflags gets masked by a value from another MSR (so CLD and CLAC
320 * are not needed). SYSCALL does not save anything on the stack
321 * and does not change rsp.
322 *
323 * Note: rflags saving+masking-with-MSR happens only in Long mode
324 * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
325 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
326 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
327 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
328 *
275 * Arguments: 329 * Arguments:
276 * %eax System call number. 330 * eax system call number
277 * %ebx Arg1 331 * ecx return address
278 * %ecx return EIP 332 * ebx arg1
279 * %edx Arg3 333 * ebp arg2 (note: not saved in the stack frame, should not be touched)
280 * %esi Arg4 334 * edx arg3
281 * %edi Arg5 335 * esi arg4
282 * %ebp Arg2 [note: not saved in the stack frame, should not be touched] 336 * edi arg5
283 * %esp user stack 337 * esp user stack
284 * 0(%esp) Arg6 338 * 0(%esp) arg6
285 * 339 *
286 * Interrupts off.
287 *
288 * This is purely a fast path. For anything complicated we use the int 0x80 340 * This is purely a fast path. For anything complicated we use the int 0x80
289 * path below. Set up a complete hardware stack frame to share code 341 * path below. We set up a complete hardware stack frame to share code
290 * with the int 0x80 path. 342 * with the int 0x80 path.
291 */ 343 */
292ENTRY(ia32_cstar_target) 344ENTRY(ia32_cstar_target)
293 CFI_STARTPROC32 simple 345 CFI_STARTPROC32 simple
294 CFI_SIGNAL_FRAME 346 CFI_SIGNAL_FRAME
295 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 347 CFI_DEF_CFA rsp,0
296 CFI_REGISTER rip,rcx 348 CFI_REGISTER rip,rcx
297 /*CFI_REGISTER rflags,r11*/ 349 /*CFI_REGISTER rflags,r11*/
350
351 /*
352 * Interrupts are off on entry.
353 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
354 * it is too small to ever cause noticeable irq latency.
355 */
298 SWAPGS_UNSAFE_STACK 356 SWAPGS_UNSAFE_STACK
299 movl %esp,%r8d 357 movl %esp,%r8d
300 CFI_REGISTER rsp,r8 358 CFI_REGISTER rsp,r8
301 movq PER_CPU_VAR(kernel_stack),%rsp 359 movq PER_CPU_VAR(kernel_stack),%rsp
302 /*
303 * No need to follow this irqs on/off section: the syscall
304 * disabled irqs and here we enable it straight after entry:
305 */
306 ENABLE_INTERRUPTS(CLBR_NONE) 360 ENABLE_INTERRUPTS(CLBR_NONE)
307 SAVE_ARGS 8,0,0 361
308 movl %eax,%eax /* zero extension */ 362 /* Zero-extending 32-bit regs, do not remove */
309 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 363 movl %eax,%eax
310 movq %rcx,RIP-ARGOFFSET(%rsp) 364
311 CFI_REL_OFFSET rip,RIP-ARGOFFSET 365 /* Construct struct pt_regs on stack */
312 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ 366 pushq_cfi $__USER32_DS /* pt_regs->ss */
367 pushq_cfi %r8 /* pt_regs->sp */
368 CFI_REL_OFFSET rsp,0
369 pushq_cfi %r11 /* pt_regs->flags */
370 pushq_cfi $__USER32_CS /* pt_regs->cs */
371 pushq_cfi %rcx /* pt_regs->ip */
372 CFI_REL_OFFSET rip,0
373 pushq_cfi_reg rax /* pt_regs->orig_ax */
374 pushq_cfi_reg rdi /* pt_regs->di */
375 pushq_cfi_reg rsi /* pt_regs->si */
376 pushq_cfi_reg rdx /* pt_regs->dx */
377 pushq_cfi_reg rbp /* pt_regs->cx */
313 movl %ebp,%ecx 378 movl %ebp,%ecx
314 movq $__USER32_CS,CS-ARGOFFSET(%rsp) 379 pushq_cfi_reg rax /* pt_regs->ax */
315 movq $__USER32_DS,SS-ARGOFFSET(%rsp) 380 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
316 movq %r11,EFLAGS-ARGOFFSET(%rsp) 381 CFI_ADJUST_CFA_OFFSET 10*8
317 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 382
318 movq %r8,RSP-ARGOFFSET(%rsp) 383 /*
319 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 384 * no need to do an access_ok check here because r8 has been
320 /* no need to do an access_ok check here because r8 has been 385 * 32bit zero extended
321 32bit zero extended */ 386 */
322 /* hardware stack frame is complete now */
323 ASM_STAC 387 ASM_STAC
3241: movl (%r8),%r9d 3881: movl (%r8),%r9d
325 _ASM_EXTABLE(1b,ia32_badarg) 389 _ASM_EXTABLE(1b,ia32_badarg)
326 ASM_CLAC 390 ASM_CLAC
327 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 391 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
328 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 392 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
329 CFI_REMEMBER_STATE 393 CFI_REMEMBER_STATE
330 jnz cstar_tracesys 394 jnz cstar_tracesys
331 cmpq $IA32_NR_syscalls-1,%rax 395 cmpq $IA32_NR_syscalls-1,%rax
332 ja ia32_badsys 396 ja ia32_badsys
333cstar_do_call: 397cstar_do_call:
334 IA32_ARG_FIXUP 1 398 /* 32bit syscall -> 64bit C ABI argument conversion */
399 movl %edi,%r8d /* arg5 */
400 /* r9 already loaded */ /* arg6 */
401 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
402 movl %ebx,%edi /* arg1 */
403 movl %edx,%edx /* arg3 (zero extension) */
335cstar_dispatch: 404cstar_dispatch:
336 call *ia32_sys_call_table(,%rax,8) 405 call *ia32_sys_call_table(,%rax,8)
337 movq %rax,RAX-ARGOFFSET(%rsp) 406 movq %rax,RAX(%rsp)
338 DISABLE_INTERRUPTS(CLBR_NONE) 407 DISABLE_INTERRUPTS(CLBR_NONE)
339 TRACE_IRQS_OFF 408 TRACE_IRQS_OFF
340 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 409 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
341 jnz sysretl_audit 410 jnz sysretl_audit
342sysretl_from_sys_call: 411sysretl_from_sys_call:
343 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 412 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
344 RESTORE_ARGS 0,-ARG_SKIP,0,0,0 413 RESTORE_RSI_RDI_RDX
345 movl RIP-ARGOFFSET(%rsp),%ecx 414 movl RIP(%rsp),%ecx
346 CFI_REGISTER rip,rcx 415 CFI_REGISTER rip,rcx
347 movl EFLAGS-ARGOFFSET(%rsp),%r11d 416 movl EFLAGS(%rsp),%r11d
348 /*CFI_REGISTER rflags,r11*/ 417 /*CFI_REGISTER rflags,r11*/
349 xorq %r10,%r10 418 xorq %r10,%r10
350 xorq %r9,%r9 419 xorq %r9,%r9
351 xorq %r8,%r8 420 xorq %r8,%r8
352 TRACE_IRQS_ON 421 TRACE_IRQS_ON
353 movl RSP-ARGOFFSET(%rsp),%esp 422 movl RSP(%rsp),%esp
354 CFI_RESTORE rsp 423 CFI_RESTORE rsp
424 /*
425 * 64bit->32bit SYSRET restores eip from ecx,
426 * eflags from r11 (but RF and VM bits are forced to 0),
427 * cs and ss are loaded from MSRs.
428 * (Note: 32bit->32bit SYSRET is different: since r11
429 * does not exist, it merely sets eflags.IF=1).
430 */
355 USERGS_SYSRET32 431 USERGS_SYSRET32
356 432
357#ifdef CONFIG_AUDITSYSCALL 433#ifdef CONFIG_AUDITSYSCALL
358cstar_auditsys: 434cstar_auditsys:
359 CFI_RESTORE_STATE 435 CFI_RESTORE_STATE
360 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ 436 movl %r9d,R9(%rsp) /* register to be clobbered by call */
361 auditsys_entry_common 437 auditsys_entry_common
362 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ 438 movl R9(%rsp),%r9d /* reload 6th syscall arg */
363 jmp cstar_dispatch 439 jmp cstar_dispatch
364 440
365sysretl_audit: 441sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
368 444
369cstar_tracesys: 445cstar_tracesys:
370#ifdef CONFIG_AUDITSYSCALL 446#ifdef CONFIG_AUDITSYSCALL
371 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 447 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
372 jz cstar_auditsys 448 jz cstar_auditsys
373#endif 449#endif
374 xchgl %r9d,%ebp 450 xchgl %r9d,%ebp
375 SAVE_REST 451 SAVE_EXTRA_REGS
376 CLEAR_RREGS 0, r9 452 CLEAR_RREGS r9
377 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 453 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
378 movq %rsp,%rdi /* &pt_regs -> arg1 */ 454 movq %rsp,%rdi /* &pt_regs -> arg1 */
379 call syscall_trace_enter 455 call syscall_trace_enter
380 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ 456 LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
381 RESTORE_REST 457 RESTORE_EXTRA_REGS
382 xchgl %ebp,%r9d 458 xchgl %ebp,%r9d
383 cmpq $(IA32_NR_syscalls-1),%rax 459 cmpq $(IA32_NR_syscalls-1),%rax
384 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 460 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
391 jmp ia32_sysret 467 jmp ia32_sysret
392 CFI_ENDPROC 468 CFI_ENDPROC
393 469
394/* 470/*
395 * Emulated IA32 system calls via int 0x80. 471 * Emulated IA32 system calls via int 0x80.
396 * 472 *
397 * Arguments: 473 * Arguments:
398 * %eax System call number. 474 * eax system call number
399 * %ebx Arg1 475 * ebx arg1
400 * %ecx Arg2 476 * ecx arg2
401 * %edx Arg3 477 * edx arg3
402 * %esi Arg4 478 * esi arg4
403 * %edi Arg5 479 * edi arg5
404 * %ebp Arg6 [note: not saved in the stack frame, should not be touched] 480 * ebp arg6 (note: not saved in the stack frame, should not be touched)
405 * 481 *
406 * Notes: 482 * Notes:
407 * Uses the same stack frame as the x86-64 version. 483 * Uses the same stack frame as the x86-64 version.
408 * All registers except %eax must be saved (but ptrace may violate that) 484 * All registers except eax must be saved (but ptrace may violate that).
409 * Arguments are zero extended. For system calls that want sign extension and 485 * Arguments are zero extended. For system calls that want sign extension and
410 * take long arguments a wrapper is needed. Most calls can just be called 486 * take long arguments a wrapper is needed. Most calls can just be called
411 * directly. 487 * directly.
412 * Assumes it is only called from user space and entered with interrupts off. 488 * Assumes it is only called from user space and entered with interrupts off.
413 */ 489 */
414 490
415ENTRY(ia32_syscall) 491ENTRY(ia32_syscall)
416 CFI_STARTPROC32 simple 492 CFI_STARTPROC32 simple
417 CFI_SIGNAL_FRAME 493 CFI_SIGNAL_FRAME
418 CFI_DEF_CFA rsp,SS+8-RIP 494 CFI_DEF_CFA rsp,5*8
419 /*CFI_REL_OFFSET ss,SS-RIP*/ 495 /*CFI_REL_OFFSET ss,4*8 */
420 CFI_REL_OFFSET rsp,RSP-RIP 496 CFI_REL_OFFSET rsp,3*8
421 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 497 /*CFI_REL_OFFSET rflags,2*8 */
422 /*CFI_REL_OFFSET cs,CS-RIP*/ 498 /*CFI_REL_OFFSET cs,1*8 */
423 CFI_REL_OFFSET rip,RIP-RIP 499 CFI_REL_OFFSET rip,0*8
424 PARAVIRT_ADJUST_EXCEPTION_FRAME 500
425 SWAPGS
426 /* 501 /*
427 * No need to follow this irqs on/off section: the syscall 502 * Interrupts are off on entry.
428 * disabled irqs and here we enable it straight after entry: 503 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
504 * it is too small to ever cause noticeable irq latency.
429 */ 505 */
506 PARAVIRT_ADJUST_EXCEPTION_FRAME
507 SWAPGS
430 ENABLE_INTERRUPTS(CLBR_NONE) 508 ENABLE_INTERRUPTS(CLBR_NONE)
431 movl %eax,%eax 509
432 pushq_cfi %rax 510 /* Zero-extending 32-bit regs, do not remove */
511 movl %eax,%eax
512
513 /* Construct struct pt_regs on stack (iret frame is already on stack) */
514 pushq_cfi_reg rax /* pt_regs->orig_ax */
515 pushq_cfi_reg rdi /* pt_regs->di */
516 pushq_cfi_reg rsi /* pt_regs->si */
517 pushq_cfi_reg rdx /* pt_regs->dx */
518 pushq_cfi_reg rcx /* pt_regs->cx */
519 pushq_cfi_reg rax /* pt_regs->ax */
433 cld 520 cld
434 /* note the registers are not zero extended to the sf. 521 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
435 this could be a problem. */ 522 CFI_ADJUST_CFA_OFFSET 10*8
436 SAVE_ARGS 0,1,0 523
437 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 524 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
438 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 525 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
439 jnz ia32_tracesys 526 jnz ia32_tracesys
440 cmpq $(IA32_NR_syscalls-1),%rax 527 cmpq $(IA32_NR_syscalls-1),%rax
441 ja ia32_badsys 528 ja ia32_badsys
442ia32_do_call: 529ia32_do_call:
443 IA32_ARG_FIXUP 530 /* 32bit syscall -> 64bit C ABI argument conversion */
531 movl %edi,%r8d /* arg5 */
532 movl %ebp,%r9d /* arg6 */
533 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
534 movl %ebx,%edi /* arg1 */
535 movl %edx,%edx /* arg3 (zero extension) */
444 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 536 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
445ia32_sysret: 537ia32_sysret:
446 movq %rax,RAX-ARGOFFSET(%rsp) 538 movq %rax,RAX(%rsp)
447ia32_ret_from_sys_call: 539ia32_ret_from_sys_call:
448 CLEAR_RREGS -ARGOFFSET 540 CLEAR_RREGS
449 jmp int_ret_from_sys_call 541 jmp int_ret_from_sys_call
450 542
451ia32_tracesys: 543ia32_tracesys:
452 SAVE_REST 544 SAVE_EXTRA_REGS
453 CLEAR_RREGS 545 CLEAR_RREGS
454 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 546 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
455 movq %rsp,%rdi /* &pt_regs -> arg1 */ 547 movq %rsp,%rdi /* &pt_regs -> arg1 */
456 call syscall_trace_enter 548 call syscall_trace_enter
457 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 549 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
458 RESTORE_REST 550 RESTORE_EXTRA_REGS
459 cmpq $(IA32_NR_syscalls-1),%rax 551 cmpq $(IA32_NR_syscalls-1),%rax
460 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ 552 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
461 jmp ia32_do_call 553 jmp ia32_do_call
462END(ia32_syscall) 554END(ia32_syscall)
463 555
464ia32_badsys: 556ia32_badsys:
465 movq $0,ORIG_RAX-ARGOFFSET(%rsp) 557 movq $0,ORIG_RAX(%rsp)
466 movq $-ENOSYS,%rax 558 movq $-ENOSYS,%rax
467 jmp ia32_sysret 559 jmp ia32_sysret
468 560
@@ -479,8 +571,6 @@ GLOBAL(\label)
479 571
480 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn 572 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
481 PTREGSCALL stub32_sigreturn, sys32_sigreturn 573 PTREGSCALL stub32_sigreturn, sys32_sigreturn
482 PTREGSCALL stub32_execve, compat_sys_execve
483 PTREGSCALL stub32_execveat, compat_sys_execveat
484 PTREGSCALL stub32_fork, sys_fork 574 PTREGSCALL stub32_fork, sys_fork
485 PTREGSCALL stub32_vfork, sys_vfork 575 PTREGSCALL stub32_vfork, sys_vfork
486 576
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
492 582
493 ALIGN 583 ALIGN
494ia32_ptregs_common: 584ia32_ptregs_common:
495 popq %r11
496 CFI_ENDPROC 585 CFI_ENDPROC
497 CFI_STARTPROC32 simple 586 CFI_STARTPROC32 simple
498 CFI_SIGNAL_FRAME 587 CFI_SIGNAL_FRAME
499 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 588 CFI_DEF_CFA rsp,SIZEOF_PTREGS
500 CFI_REL_OFFSET rax,RAX-ARGOFFSET 589 CFI_REL_OFFSET rax,RAX
501 CFI_REL_OFFSET rcx,RCX-ARGOFFSET 590 CFI_REL_OFFSET rcx,RCX
502 CFI_REL_OFFSET rdx,RDX-ARGOFFSET 591 CFI_REL_OFFSET rdx,RDX
503 CFI_REL_OFFSET rsi,RSI-ARGOFFSET 592 CFI_REL_OFFSET rsi,RSI
504 CFI_REL_OFFSET rdi,RDI-ARGOFFSET 593 CFI_REL_OFFSET rdi,RDI
505 CFI_REL_OFFSET rip,RIP-ARGOFFSET 594 CFI_REL_OFFSET rip,RIP
506/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ 595/* CFI_REL_OFFSET cs,CS*/
507/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 596/* CFI_REL_OFFSET rflags,EFLAGS*/
508 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 597 CFI_REL_OFFSET rsp,RSP
509/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ 598/* CFI_REL_OFFSET ss,SS*/
510 SAVE_REST 599 SAVE_EXTRA_REGS 8
511 call *%rax 600 call *%rax
512 RESTORE_REST 601 RESTORE_EXTRA_REGS 8
513 jmp ia32_sysret /* misbalances the return cache */ 602 ret
514 CFI_ENDPROC 603 CFI_ENDPROC
515END(ia32_ptregs_common) 604END(ia32_ptregs_common)
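
The open-coded argument shuffles above (sysenter_do_call, cstar_do_call, ia32_do_call) all perform the same i386-to-x86-64 calling-convention mapping that the removed IA32_ARG_FIXUP macro used to hide. The standalone C table below is an editorial summary of that mapping, not kernel code; the names ia32_arg_map and the example program are mine.

#include <stdio.h>
#include <stddef.h>

/* Illustrative summary of the register shuffle done above: i386 syscall
 * arguments arrive in ebx..ebp, the 64-bit C ABI wants them in rdi..r9.
 * Editorial shorthand only, not part of the kernel sources. */
static const struct {
	int arg;
	const char *i386_reg;
	const char *x86_64_reg;
	const char *insn;
} ia32_arg_map[] = {
	{ 1, "ebx", "rdi", "movl %ebx,%edi" },
	{ 2, "ecx", "rsi", "xchg %ecx,%esi (esi half)" },
	{ 3, "edx", "rdx", "movl %edx,%edx (zero extension)" },
	{ 4, "esi", "rcx", "xchg %ecx,%esi (ecx half)" },
	{ 5, "edi", "r8",  "movl %edi,%r8d" },
	{ 6, "ebp", "r9",  "movl %ebp,%r9d (already in r9 on the SYSCALL path)" },
};

int main(void)
{
	for (size_t i = 0; i < sizeof(ia32_arg_map) / sizeof(ia32_arg_map[0]); i++)
		printf("arg%d: %-3s -> %-3s  (%s)\n", ia32_arg_map[i].arg,
		       ia32_arg_map[i].i386_reg, ia32_arg_map[i].x86_64_reg,
		       ia32_arg_map[i].insn);
	return 0;
}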
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3
4long compat_ni_syscall(void)
5{
6 return -ENOSYS;
7}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
201 advice); 201 advice);
202} 202}
203 203
204long sys32_vm86_warning(void)
205{
206 struct task_struct *me = current;
207 static char lastcomm[sizeof(me->comm)];
208
209 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
210 compat_printk(KERN_INFO
211 "%s: vm86 mode not supported on 64 bit kernel\n",
212 me->comm);
213 strncpy(lastcomm, me->comm, sizeof(lastcomm));
214 }
215 return -ENOSYS;
216}
217
218asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, 204asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
219 size_t count) 205 size_t count)
220{ 206{
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
1/* System call table for ia32 emulation. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
13
14typedef void (*sys_call_ptr_t)(void);
15
16extern void compat_ni_syscall(void);
17
18const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
24#include <asm/syscalls_32.h>
25};
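
The table initializer in the file deleted above relies on GCC's range-designated-initializer extension ([0 ... N] = value) to default every slot to compat_ni_syscall before the generated per-syscall entries override individual slots. A small standalone sketch of that idiom follows; the handler names (not_implemented, fake_getpid) are hypothetical placeholders of mine, not kernel symbols.

#include <stdio.h>
#include <errno.h>

/* Standalone illustration of the GCC range-designator idiom: every slot
 * defaults to a "not implemented" handler, then a later designator
 * overrides one specific entry.  Not kernel code. */
typedef long (*sys_call_ptr_t)(void);

static long not_implemented(void) { return -ENOSYS; }
static long fake_getpid(void)     { return 1234; }	/* hypothetical handler */

static const sys_call_ptr_t table[16] = {
	[0 ... 15] = not_implemented,	/* GCC extension: fill the whole range */
	[3]        = fake_getpid,	/* specific entry overrides the default */
};

int main(void)
{
	printf("table[0]() = %ld, table[3]() = %ld\n", table[0](), table[3]());
	return 0;
}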
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,63 @@
18 .endm 18 .endm
19#endif 19#endif
20 20
21.macro altinstruction_entry orig alt feature orig_len alt_len 21.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
22 .long \orig - . 22 .long \orig - .
23 .long \alt - . 23 .long \alt - .
24 .word \feature 24 .word \feature
25 .byte \orig_len 25 .byte \orig_len
26 .byte \alt_len 26 .byte \alt_len
27 .byte \pad_len
28.endm
29
30.macro ALTERNATIVE oldinstr, newinstr, feature
31140:
32 \oldinstr
33141:
34 .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
35142:
36
37 .pushsection .altinstructions,"a"
38 altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
39 .popsection
40
41 .pushsection .altinstr_replacement,"ax"
42143:
43 \newinstr
44144:
45 .popsection
46.endm
47
48#define old_len 141b-140b
49#define new_len1 144f-143f
50#define new_len2 145f-144f
51
52/*
53 * max without conditionals. Idea adapted from:
54 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
55 */
56#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
57
58.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
59140:
60 \oldinstr
61141:
62 .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
63 (alt_max_short(new_len1, new_len2) - (old_len)),0x90
64142:
65
66 .pushsection .altinstructions,"a"
67 altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
68 altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
69 .popsection
70
71 .pushsection .altinstr_replacement,"ax"
72143:
73 \newinstr1
74144:
75 \newinstr2
76145:
77 .popsection
27.endm 78.endm
28 79
29#endif /* __ASSEMBLY__ */ 80#endif /* __ASSEMBLY__ */
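
The alt_max_short() helper introduced above computes a maximum without branches so it can be evaluated inside a .skip expression at assembly time. A quick userspace check of the same bit trick, written with C semantics (where a true comparison is 1; the assembler flavour adds extra negation for gas's integer semantics, as the matching comment in alternative.h notes). The function name below mirrors the macro but the program itself is only an illustration.

#include <assert.h>

/* Branch-free max, the same idea as alt_max_short above: when a < b the
 * mask -(a < b) is all ones and the XOR turns a into b; otherwise the
 * mask is zero and a is returned unchanged. */
static int alt_max_short(int a, int b)
{
	return a ^ ((a ^ b) & -(a < b));
}

int main(void)
{
	assert(alt_max_short(3, 7) == 7);
	assert(alt_max_short(7, 3) == 7);
	assert(alt_max_short(5, 5) == 5);
	return 0;
}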
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..ba32af062f61 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
48 s32 repl_offset; /* offset to replacement instruction */ 48 s32 repl_offset; /* offset to replacement instruction */
49 u16 cpuid; /* cpuid bit set for replacement */ 49 u16 cpuid; /* cpuid bit set for replacement */
50 u8 instrlen; /* length of original instruction */ 50 u8 instrlen; /* length of original instruction */
51 u8 replacementlen; /* length of new instruction, <= instrlen */ 51 u8 replacementlen; /* length of new instruction */
52}; 52 u8 padlen; /* length of build-time padding */
53} __packed;
53 54
54extern void alternative_instructions(void); 55extern void alternative_instructions(void);
55extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); 56extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end)
76} 77}
77#endif /* CONFIG_SMP */ 78#endif /* CONFIG_SMP */
78 79
79#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" 80#define b_replacement(num) "664"#num
81#define e_replacement(num) "665"#num
80 82
81#define b_replacement(number) "663"#number 83#define alt_end_marker "663"
82#define e_replacement(number) "664"#number 84#define alt_slen "662b-661b"
85#define alt_pad_len alt_end_marker"b-662b"
86#define alt_total_slen alt_end_marker"b-661b"
87#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
83 88
84#define alt_slen "662b-661b" 89#define __OLDINSTR(oldinstr, num) \
85#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" 90 "661:\n\t" oldinstr "\n662:\n" \
91 ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
92 "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
86 93
87#define ALTINSTR_ENTRY(feature, number) \ 94#define OLDINSTR(oldinstr, num) \
95 __OLDINSTR(oldinstr, num) \
96 alt_end_marker ":\n"
97
98/*
99 * max without conditionals. Idea adapted from:
100 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
101 *
102 * The additional "-" is needed because gas works with s32s.
103 */
104#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
105
106/*
107 * Pad the second replacement alternative with additional NOPs if it is
108 * additionally longer than the first replacement alternative.
109 */
110#define OLDINSTR_2(oldinstr, num1, num2) \
111 "661:\n\t" oldinstr "\n662:\n" \
112 ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
113 "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
114 alt_end_marker ":\n"
115
116#define ALTINSTR_ENTRY(feature, num) \
88 " .long 661b - .\n" /* label */ \ 117 " .long 661b - .\n" /* label */ \
89 " .long " b_replacement(number)"f - .\n" /* new instruction */ \ 118 " .long " b_replacement(num)"f - .\n" /* new instruction */ \
90 " .word " __stringify(feature) "\n" /* feature bit */ \ 119 " .word " __stringify(feature) "\n" /* feature bit */ \
91 " .byte " alt_slen "\n" /* source len */ \ 120 " .byte " alt_total_slen "\n" /* source len */ \
92 " .byte " alt_rlen(number) "\n" /* replacement len */ 121 " .byte " alt_rlen(num) "\n" /* replacement len */ \
93 122 " .byte " alt_pad_len "\n" /* pad len */
94#define DISCARD_ENTRY(number) /* rlen <= slen */ \
95 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
96 123
97#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ 124#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
98 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" 125 b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
99 126
100/* alternative assembly primitive: */ 127/* alternative assembly primitive: */
101#define ALTERNATIVE(oldinstr, newinstr, feature) \ 128#define ALTERNATIVE(oldinstr, newinstr, feature) \
102 OLDINSTR(oldinstr) \ 129 OLDINSTR(oldinstr, 1) \
103 ".pushsection .altinstructions,\"a\"\n" \ 130 ".pushsection .altinstructions,\"a\"\n" \
104 ALTINSTR_ENTRY(feature, 1) \ 131 ALTINSTR_ENTRY(feature, 1) \
105 ".popsection\n" \ 132 ".popsection\n" \
106 ".pushsection .discard,\"aw\",@progbits\n" \
107 DISCARD_ENTRY(1) \
108 ".popsection\n" \
109 ".pushsection .altinstr_replacement, \"ax\"\n" \ 133 ".pushsection .altinstr_replacement, \"ax\"\n" \
110 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ 134 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
111 ".popsection" 135 ".popsection"
112 136
113#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ 137#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
114 OLDINSTR(oldinstr) \ 138 OLDINSTR_2(oldinstr, 1, 2) \
115 ".pushsection .altinstructions,\"a\"\n" \ 139 ".pushsection .altinstructions,\"a\"\n" \
116 ALTINSTR_ENTRY(feature1, 1) \ 140 ALTINSTR_ENTRY(feature1, 1) \
117 ALTINSTR_ENTRY(feature2, 2) \ 141 ALTINSTR_ENTRY(feature2, 2) \
118 ".popsection\n" \ 142 ".popsection\n" \
119 ".pushsection .discard,\"aw\",@progbits\n" \
120 DISCARD_ENTRY(1) \
121 DISCARD_ENTRY(2) \
122 ".popsection\n" \
123 ".pushsection .altinstr_replacement, \"ax\"\n" \ 143 ".pushsection .altinstr_replacement, \"ax\"\n" \
124 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ 144 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
125 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ 145 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
146#define alternative(oldinstr, newinstr, feature) \ 166#define alternative(oldinstr, newinstr, feature) \
147 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") 167 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
148 168
169#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
170 asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
171
149/* 172/*
150 * Alternative inline assembly with input. 173 * Alternative inline assembly with input.
151 * 174 *
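
The new __OLDINSTR()/OLDINSTR_2() definitions above pad the original instruction with 0x90 (NOP) bytes whenever a replacement is longer, using the branch-free ".skip -((rlen - slen) > 0) * (rlen - slen), 0x90" expression. The same arithmetic in plain C, as a sanity check: gas treats a true comparison as -1, hence the leading minus in the assembler string; C's comparison yields 1, so it is dropped here. The helper name nop_pad_len is my own.

#include <assert.h>

/* C rendition of the padding expression used by __OLDINSTR()/OLDINSTR_2():
 * pad with (replacement_len - original_len) NOPs, but never a negative
 * amount. */
static int nop_pad_len(int original_len, int replacement_len)
{
	return ((replacement_len - original_len) > 0) *
	       (replacement_len - original_len);
}

int main(void)
{
	assert(nop_pad_len(2, 5) == 3);	/* longer replacement: pad 3 NOPs */
	assert(nop_pad_len(5, 2) == 0);	/* shorter replacement: no padding */
	assert(nop_pad_len(4, 4) == 0);
	return 0;
}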
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..976b86a325e5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
91{ 91{
92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); 92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
93 93
94 alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, 94 alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
95 ASM_OUTPUT2("=r" (v), "=m" (*addr)), 95 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
96 ASM_OUTPUT2("0" (v), "m" (*addr))); 96 ASM_OUTPUT2("0" (v), "m" (*addr)));
97} 97}
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void);
204extern void disconnect_bsp_APIC(int virt_wire_setup); 204extern void disconnect_bsp_APIC(int virt_wire_setup);
205extern void disable_local_APIC(void); 205extern void disable_local_APIC(void);
206extern void lapic_shutdown(void); 206extern void lapic_shutdown(void);
207extern int verify_local_APIC(void);
208extern void sync_Arb_IDs(void); 207extern void sync_Arb_IDs(void);
209extern void init_bsp_APIC(void); 208extern void init_bsp_APIC(void);
210extern void setup_local_APIC(void); 209extern void setup_local_APIC(void);
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
95 * Stop RDTSC speculation. This is needed when you need to use RDTSC 95 * Stop RDTSC speculation. This is needed when you need to use RDTSC
96 * (or get_cycles or vread that possibly accesses the TSC) in a defined 96 * (or get_cycles or vread that possibly accesses the TSC) in a defined
97 * code region. 97 * code region.
98 *
99 * (Could use an alternative three way for this if there was one.)
100 */ 98 */
101static __always_inline void rdtsc_barrier(void) 99static __always_inline void rdtsc_barrier(void)
102{ 100{
103 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 101 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
104 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 102 "lfence", X86_FEATURE_LFENCE_RDTSC);
105} 103}
106 104
107#endif /* _ASM_X86_BARRIER_H */ 105#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..1c8b50edb2db 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with
55 * for assembly code: 55 * for assembly code:
56 */ 56 */
57 57
58#define R15 0 58/* The layout forms the "struct pt_regs" on the stack: */
59#define R14 8 59/*
60#define R13 16 60 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
61#define R12 24 61 * unless syscall needs a complete, fully filled "struct pt_regs".
62#define RBP 32 62 */
63#define RBX 40 63#define R15 0*8
64 64#define R14 1*8
65/* arguments: interrupts/non tracing syscalls only save up to here: */ 65#define R13 2*8
66#define R11 48 66#define R12 3*8
67#define R10 56 67#define RBP 4*8
68#define R9 64 68#define RBX 5*8
69#define R8 72 69/* These regs are callee-clobbered. Always saved on kernel entry. */
70#define RAX 80 70#define R11 6*8
71#define RCX 88 71#define R10 7*8
72#define RDX 96 72#define R9 8*8
73#define RSI 104 73#define R8 9*8
74#define RDI 112 74#define RAX 10*8
75#define ORIG_RAX 120 /* + error_code */ 75#define RCX 11*8
76/* end of arguments */ 76#define RDX 12*8
77 77#define RSI 13*8
78/* cpu exception frame or undefined in case of fast syscall: */ 78#define RDI 14*8
79#define RIP 128 79/*
80#define CS 136 80 * On syscall entry, this is syscall#. On CPU exception, this is error code.
81#define EFLAGS 144 81 * On hw interrupt, it's IRQ number:
82#define RSP 152 82 */
83#define SS 160 83#define ORIG_RAX 15*8
84 84/* Return frame for iretq */
85#define ARGOFFSET R11 85#define RIP 16*8
86 86#define CS 17*8
87 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 87#define EFLAGS 18*8
88 subq $9*8+\addskip, %rsp 88#define RSP 19*8
89 CFI_ADJUST_CFA_OFFSET 9*8+\addskip 89#define SS 20*8
90 movq_cfi rdi, 8*8 90
91 movq_cfi rsi, 7*8 91#define SIZEOF_PTREGS 21*8
92 movq_cfi rdx, 6*8 92
93 93 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
94 .if \save_rcx 94 subq $15*8+\addskip, %rsp
95 movq_cfi rcx, 5*8 95 CFI_ADJUST_CFA_OFFSET 15*8+\addskip
96 .endif 96 .endm
97 97
98 .if \rax_enosys 98 .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
99 movq $-ENOSYS, 4*8(%rsp) 99 .if \r11
100 .else 100 movq_cfi r11, 6*8+\offset
101 movq_cfi rax, 4*8
102 .endif 101 .endif
103 102 .if \r8910
104 .if \save_r891011 103 movq_cfi r10, 7*8+\offset
105 movq_cfi r8, 3*8 104 movq_cfi r9, 8*8+\offset
106 movq_cfi r9, 2*8 105 movq_cfi r8, 9*8+\offset
107 movq_cfi r10, 1*8 106 .endif
108 movq_cfi r11, 0*8 107 .if \rax
108 movq_cfi rax, 10*8+\offset
109 .endif
110 .if \rcx
111 movq_cfi rcx, 11*8+\offset
109 .endif 112 .endif
113 movq_cfi rdx, 12*8+\offset
114 movq_cfi rsi, 13*8+\offset
115 movq_cfi rdi, 14*8+\offset
116 .endm
117 .macro SAVE_C_REGS offset=0
118 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
119 .endm
120 .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
121 SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
122 .endm
123 .macro SAVE_C_REGS_EXCEPT_R891011
124 SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
125 .endm
126 .macro SAVE_C_REGS_EXCEPT_RCX_R891011
127 SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
128 .endm
129 .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
130 SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
131 .endm
132
133 .macro SAVE_EXTRA_REGS offset=0
134 movq_cfi r15, 0*8+\offset
135 movq_cfi r14, 1*8+\offset
136 movq_cfi r13, 2*8+\offset
137 movq_cfi r12, 3*8+\offset
138 movq_cfi rbp, 4*8+\offset
139 movq_cfi rbx, 5*8+\offset
140 .endm
141 .macro SAVE_EXTRA_REGS_RBP offset=0
142 movq_cfi rbp, 4*8+\offset
143 .endm
110 144
145 .macro RESTORE_EXTRA_REGS offset=0
146 movq_cfi_restore 0*8+\offset, r15
147 movq_cfi_restore 1*8+\offset, r14
148 movq_cfi_restore 2*8+\offset, r13
149 movq_cfi_restore 3*8+\offset, r12
150 movq_cfi_restore 4*8+\offset, rbp
151 movq_cfi_restore 5*8+\offset, rbx
111 .endm 152 .endm
112 153
113#define ARG_SKIP (9*8) 154 .macro ZERO_EXTRA_REGS
155 xorl %r15d, %r15d
156 xorl %r14d, %r14d
157 xorl %r13d, %r13d
158 xorl %r12d, %r12d
159 xorl %ebp, %ebp
160 xorl %ebx, %ebx
161 .endm
114 162
115 .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ 163 .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
116 rstor_r8910=1, rstor_rdx=1
117 .if \rstor_r11 164 .if \rstor_r11
118 movq_cfi_restore 0*8, r11 165 movq_cfi_restore 6*8, r11
119 .endif 166 .endif
120
121 .if \rstor_r8910 167 .if \rstor_r8910
122 movq_cfi_restore 1*8, r10 168 movq_cfi_restore 7*8, r10
123 movq_cfi_restore 2*8, r9 169 movq_cfi_restore 8*8, r9
124 movq_cfi_restore 3*8, r8 170 movq_cfi_restore 9*8, r8
125 .endif 171 .endif
126
127 .if \rstor_rax 172 .if \rstor_rax
128 movq_cfi_restore 4*8, rax 173 movq_cfi_restore 10*8, rax
129 .endif 174 .endif
130
131 .if \rstor_rcx 175 .if \rstor_rcx
132 movq_cfi_restore 5*8, rcx 176 movq_cfi_restore 11*8, rcx
133 .endif 177 .endif
134
135 .if \rstor_rdx 178 .if \rstor_rdx
136 movq_cfi_restore 6*8, rdx 179 movq_cfi_restore 12*8, rdx
137 .endif
138
139 movq_cfi_restore 7*8, rsi
140 movq_cfi_restore 8*8, rdi
141
142 .if ARG_SKIP+\addskip > 0
143 addq $ARG_SKIP+\addskip, %rsp
144 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
145 .endif 180 .endif
181 movq_cfi_restore 13*8, rsi
182 movq_cfi_restore 14*8, rdi
146 .endm 183 .endm
147 184 .macro RESTORE_C_REGS
148 .macro LOAD_ARGS offset, skiprax=0 185 RESTORE_C_REGS_HELPER 1,1,1,1,1
149 movq \offset(%rsp), %r11
150 movq \offset+8(%rsp), %r10
151 movq \offset+16(%rsp), %r9
152 movq \offset+24(%rsp), %r8
153 movq \offset+40(%rsp), %rcx
154 movq \offset+48(%rsp), %rdx
155 movq \offset+56(%rsp), %rsi
156 movq \offset+64(%rsp), %rdi
157 .if \skiprax
158 .else
159 movq \offset+72(%rsp), %rax
160 .endif
161 .endm 186 .endm
162 187 .macro RESTORE_C_REGS_EXCEPT_RAX
163#define REST_SKIP (6*8) 188 RESTORE_C_REGS_HELPER 0,1,1,1,1
164
165 .macro SAVE_REST
166 subq $REST_SKIP, %rsp
167 CFI_ADJUST_CFA_OFFSET REST_SKIP
168 movq_cfi rbx, 5*8
169 movq_cfi rbp, 4*8
170 movq_cfi r12, 3*8
171 movq_cfi r13, 2*8
172 movq_cfi r14, 1*8
173 movq_cfi r15, 0*8
174 .endm 189 .endm
175 190 .macro RESTORE_C_REGS_EXCEPT_RCX
176 .macro RESTORE_REST 191 RESTORE_C_REGS_HELPER 1,0,1,1,1
177 movq_cfi_restore 0*8, r15
178 movq_cfi_restore 1*8, r14
179 movq_cfi_restore 2*8, r13
180 movq_cfi_restore 3*8, r12
181 movq_cfi_restore 4*8, rbp
182 movq_cfi_restore 5*8, rbx
183 addq $REST_SKIP, %rsp
184 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
185 .endm 192 .endm
186 193 .macro RESTORE_C_REGS_EXCEPT_R11
187 .macro SAVE_ALL 194 RESTORE_C_REGS_HELPER 1,1,0,1,1
188 SAVE_ARGS 195 .endm
189 SAVE_REST 196 .macro RESTORE_C_REGS_EXCEPT_RCX_R11
197 RESTORE_C_REGS_HELPER 1,0,0,1,1
198 .endm
199 .macro RESTORE_RSI_RDI
200 RESTORE_C_REGS_HELPER 0,0,0,0,0
201 .endm
202 .macro RESTORE_RSI_RDI_RDX
203 RESTORE_C_REGS_HELPER 0,0,0,0,1
190 .endm 204 .endm
191 205
192 .macro RESTORE_ALL addskip=0 206 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
193 RESTORE_REST 207 addq $15*8+\addskip, %rsp
194 RESTORE_ARGS 1, \addskip 208 CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
195 .endm 209 .endm
196 210
197 .macro icebp 211 .macro icebp
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with
210 */ 224 */
211 225
212 .macro SAVE_ALL 226 .macro SAVE_ALL
213 pushl_cfi %eax 227 pushl_cfi_reg eax
214 CFI_REL_OFFSET eax, 0 228 pushl_cfi_reg ebp
215 pushl_cfi %ebp 229 pushl_cfi_reg edi
216 CFI_REL_OFFSET ebp, 0 230 pushl_cfi_reg esi
217 pushl_cfi %edi 231 pushl_cfi_reg edx
218 CFI_REL_OFFSET edi, 0 232 pushl_cfi_reg ecx
219 pushl_cfi %esi 233 pushl_cfi_reg ebx
220 CFI_REL_OFFSET esi, 0
221 pushl_cfi %edx
222 CFI_REL_OFFSET edx, 0
223 pushl_cfi %ecx
224 CFI_REL_OFFSET ecx, 0
225 pushl_cfi %ebx
226 CFI_REL_OFFSET ebx, 0
227 .endm 234 .endm
228 235
229 .macro RESTORE_ALL 236 .macro RESTORE_ALL
230 popl_cfi %ebx 237 popl_cfi_reg ebx
231 CFI_RESTORE ebx 238 popl_cfi_reg ecx
232 popl_cfi %ecx 239 popl_cfi_reg edx
233 CFI_RESTORE ecx 240 popl_cfi_reg esi
234 popl_cfi %edx 241 popl_cfi_reg edi
235 CFI_RESTORE edx 242 popl_cfi_reg ebp
236 popl_cfi %esi 243 popl_cfi_reg eax
237 CFI_RESTORE esi
238 popl_cfi %edi
239 CFI_RESTORE edi
240 popl_cfi %ebp
241 CFI_RESTORE ebp
242 popl_cfi %eax
243 CFI_RESTORE eax
244 .endm 244 .endm
245 245
246#endif /* CONFIG_X86_64 */ 246#endif /* CONFIG_X86_64 */
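
The new R15..SS constants above are byte offsets into the pt_regs frame that the entry code builds on the stack: six callee-preserved registers, nine callee-clobbered ones, orig_ax, then the five-word hardware iret frame. The userspace mock below only shows that the arithmetic lines up; struct mock_pt_regs is an editorial stand-in, not the kernel's <asm/ptrace.h> definition.

#include <stdio.h>
#include <stddef.h>

/* Editorial mock of the frame implied by the offsets above.  On x86-64
 * each field is 8 bytes, so e.g. ax sits at 10*8 and the whole frame is
 * 21*8 bytes (SIZEOF_PTREGS). */
struct mock_pt_regs {
	unsigned long r15, r14, r13, r12, bp, bx;		/* R15..RBX   */
	unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;	/* R11..RDI   */
	unsigned long orig_ax;					/* ORIG_RAX   */
	unsigned long ip, cs, flags, sp, ss;			/* iret frame */
};

int main(void)
{
	printf("ax   at %zu (RAX  = 10*8 = %d)\n", offsetof(struct mock_pt_regs, ax), 10 * 8);
	printf("ip   at %zu (RIP  = 16*8 = %d)\n", offsetof(struct mock_pt_regs, ip), 16 * 8);
	printf("size is %zu (SIZEOF_PTREGS = 21*8 = %d)\n", sizeof(struct mock_pt_regs), 21 * 8);
	return 0;
}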
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
301 sp = task_pt_regs(current)->sp; 301 sp = task_pt_regs(current)->sp;
302 } else { 302 } else {
303 /* -128 for the x32 ABI redzone */ 303 /* -128 for the x32 ABI redzone */
304 sp = this_cpu_read(old_rsp) - 128; 304 sp = task_pt_regs(current)->sp - 128;
305 } 305 }
306 306
307 return (void __user *)round_down(sp - len, 16); 307 return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
34#endif 34#endif
35#endif 35#endif
36 36
37DECLARE_PER_CPU(int, cpu_state);
38
39int mwait_usable(const struct cpuinfo_x86 *); 37int mwait_usable(const struct cpuinfo_x86 *);
40 38
41#endif /* _ASM_X86_CPU_H */ 39#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..7ee9b94d9921 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
12#include <asm/disabled-features.h> 12#include <asm/disabled-features.h>
13#endif 13#endif
14 14
15#define NCAPINTS 11 /* N 32-bit words worth of info */ 15#define NCAPINTS 13 /* N 32-bit words worth of info */
16#define NBUGINTS 1 /* N 32-bit bug flags */ 16#define NBUGINTS 1 /* N 32-bit bug flags */
17 17
18/* 18/*
@@ -195,6 +195,7 @@
195#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ 195#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
196#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ 196#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
197#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ 197#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
198#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
198 199
199/* Virtualization flags: Linux defined, word 8 */ 200/* Virtualization flags: Linux defined, word 8 */
200#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ 201#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -226,12 +227,15 @@
226#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 227#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
227#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ 228#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
228#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ 229#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
230#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
229#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ 231#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
230#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ 232#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
231#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ 233#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
232#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ 234#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
233#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ 235#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
236#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
234#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 237#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
238#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
235#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ 239#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
236#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ 240#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
237#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ 241#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
@@ -242,6 +246,12 @@
242#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ 246#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
243#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ 247#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
244 248
249/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
250#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
251
252/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
253#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
254
245/* 255/*
246 * BUG word(s) 256 * BUG word(s)
247 */ 257 */
@@ -418,6 +428,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
418 " .word %P0\n" /* 1: do replace */ 428 " .word %P0\n" /* 1: do replace */
419 " .byte 2b - 1b\n" /* source len */ 429 " .byte 2b - 1b\n" /* source len */
420 " .byte 0\n" /* replacement len */ 430 " .byte 0\n" /* replacement len */
431 " .byte 0\n" /* pad len */
421 ".previous\n" 432 ".previous\n"
422 /* skipping size check since replacement size = 0 */ 433 /* skipping size check since replacement size = 0 */
423 : : "i" (X86_FEATURE_ALWAYS) : : t_warn); 434 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +443,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
432 " .word %P0\n" /* feature bit */ 443 " .word %P0\n" /* feature bit */
433 " .byte 2b - 1b\n" /* source len */ 444 " .byte 2b - 1b\n" /* source len */
434 " .byte 0\n" /* replacement len */ 445 " .byte 0\n" /* replacement len */
446 " .byte 0\n" /* pad len */
435 ".previous\n" 447 ".previous\n"
436 /* skipping size check since replacement size = 0 */ 448 /* skipping size check since replacement size = 0 */
437 : : "i" (bit) : : t_no); 449 : : "i" (bit) : : t_no);
@@ -457,6 +469,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
457 " .word %P1\n" /* feature bit */ 469 " .word %P1\n" /* feature bit */
458 " .byte 2b - 1b\n" /* source len */ 470 " .byte 2b - 1b\n" /* source len */
459 " .byte 4f - 3f\n" /* replacement len */ 471 " .byte 4f - 3f\n" /* replacement len */
472 " .byte 0\n" /* pad len */
460 ".previous\n" 473 ".previous\n"
461 ".section .discard,\"aw\",@progbits\n" 474 ".section .discard,\"aw\",@progbits\n"
462 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 475 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +496,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
483static __always_inline __pure bool _static_cpu_has_safe(u16 bit) 496static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
484{ 497{
485#ifdef CC_HAVE_ASM_GOTO 498#ifdef CC_HAVE_ASM_GOTO
486/* 499 asm_volatile_goto("1: jmp %l[t_dynamic]\n"
487 * We need to spell the jumps to the compiler because, depending on the offset,
488 * the replacement jump can be bigger than the original jump, and this we cannot
489 * have. Thus, we force the jump to the widest, 4-byte, signed relative
490 * offset even though the last would often fit in less bytes.
491 */
492 asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
493 "2:\n" 500 "2:\n"
501 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
502 "((5f-4f) - (2b-1b)),0x90\n"
503 "3:\n"
494 ".section .altinstructions,\"a\"\n" 504 ".section .altinstructions,\"a\"\n"
495 " .long 1b - .\n" /* src offset */ 505 " .long 1b - .\n" /* src offset */
496 " .long 3f - .\n" /* repl offset */ 506 " .long 4f - .\n" /* repl offset */
497 " .word %P1\n" /* always replace */ 507 " .word %P1\n" /* always replace */
498 " .byte 2b - 1b\n" /* src len */ 508 " .byte 3b - 1b\n" /* src len */
499 " .byte 4f - 3f\n" /* repl len */ 509 " .byte 5f - 4f\n" /* repl len */
510 " .byte 3b - 2b\n" /* pad len */
500 ".previous\n" 511 ".previous\n"
501 ".section .altinstr_replacement,\"ax\"\n" 512 ".section .altinstr_replacement,\"ax\"\n"
502 "3: .byte 0xe9\n .long %l[t_no] - 2b\n" 513 "4: jmp %l[t_no]\n"
503 "4:\n" 514 "5:\n"
504 ".previous\n" 515 ".previous\n"
505 ".section .altinstructions,\"a\"\n" 516 ".section .altinstructions,\"a\"\n"
506 " .long 1b - .\n" /* src offset */ 517 " .long 1b - .\n" /* src offset */
507 " .long 0\n" /* no replacement */ 518 " .long 0\n" /* no replacement */
508 " .word %P0\n" /* feature bit */ 519 " .word %P0\n" /* feature bit */
509 " .byte 2b - 1b\n" /* src len */ 520 " .byte 3b - 1b\n" /* src len */
510 " .byte 0\n" /* repl len */ 521 " .byte 0\n" /* repl len */
522 " .byte 0\n" /* pad len */
511 ".previous\n" 523 ".previous\n"
512 : : "i" (bit), "i" (X86_FEATURE_ALWAYS) 524 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
513 : : t_dynamic, t_no); 525 : : t_dynamic, t_no);
@@ -527,6 +539,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
527 " .word %P2\n" /* always replace */ 539 " .word %P2\n" /* always replace */
528 " .byte 2b - 1b\n" /* source len */ 540 " .byte 2b - 1b\n" /* source len */
529 " .byte 4f - 3f\n" /* replacement len */ 541 " .byte 4f - 3f\n" /* replacement len */
542 " .byte 0\n" /* pad len */
530 ".previous\n" 543 ".previous\n"
531 ".section .discard,\"aw\",@progbits\n" 544 ".section .discard,\"aw\",@progbits\n"
532 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 545 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +554,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
541 " .word %P1\n" /* feature bit */ 554 " .word %P1\n" /* feature bit */
542 " .byte 4b - 3b\n" /* src len */ 555 " .byte 4b - 3b\n" /* src len */
543 " .byte 6f - 5f\n" /* repl len */ 556 " .byte 6f - 5f\n" /* repl len */
557 " .byte 0\n" /* pad len */
544 ".previous\n" 558 ".previous\n"
545 ".section .discard,\"aw\",@progbits\n" 559 ".section .discard,\"aw\",@progbits\n"
546 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ 560 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
376 * Pentium F0 0F bugfix can have resulted in the mapped 376 * Pentium F0 0F bugfix can have resulted in the mapped
377 * IDT being write-protected. 377 * IDT being write-protected.
378 */ 378 */
379#define set_intr_gate(n, addr) \ 379#define set_intr_gate_notrace(n, addr) \
380 do { \ 380 do { \
381 BUG_ON((unsigned)n > 0xFF); \ 381 BUG_ON((unsigned)n > 0xFF); \
382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ 382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
383 __KERNEL_CS); \ 383 __KERNEL_CS); \
384 } while (0)
385
386#define set_intr_gate(n, addr) \
387 do { \
388 set_intr_gate_notrace(n, addr); \
384 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 389 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
385 0, 0, __KERNEL_CS); \ 390 0, 0, __KERNEL_CS); \
386 } while (0) 391 } while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
86 CFI_ADJUST_CFA_OFFSET 8 86 CFI_ADJUST_CFA_OFFSET 8
87 .endm 87 .endm
88 88
89 .macro pushq_cfi_reg reg
90 pushq %\reg
91 CFI_ADJUST_CFA_OFFSET 8
92 CFI_REL_OFFSET \reg, 0
93 .endm
94
89 .macro popq_cfi reg 95 .macro popq_cfi reg
90 popq \reg 96 popq \reg
91 CFI_ADJUST_CFA_OFFSET -8 97 CFI_ADJUST_CFA_OFFSET -8
92 .endm 98 .endm
93 99
100 .macro popq_cfi_reg reg
101 popq %\reg
102 CFI_ADJUST_CFA_OFFSET -8
103 CFI_RESTORE \reg
104 .endm
105
94 .macro pushfq_cfi 106 .macro pushfq_cfi
95 pushfq 107 pushfq
96 CFI_ADJUST_CFA_OFFSET 8 108 CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
116 CFI_ADJUST_CFA_OFFSET 4 128 CFI_ADJUST_CFA_OFFSET 4
117 .endm 129 .endm
118 130
131 .macro pushl_cfi_reg reg
132 pushl %\reg
133 CFI_ADJUST_CFA_OFFSET 4
134 CFI_REL_OFFSET \reg, 0
135 .endm
136
119 .macro popl_cfi reg 137 .macro popl_cfi reg
120 popl \reg 138 popl \reg
121 CFI_ADJUST_CFA_OFFSET -4 139 CFI_ADJUST_CFA_OFFSET -4
122 .endm 140 .endm
123 141
142 .macro popl_cfi_reg reg
143 popl %\reg
144 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE \reg
146 .endm
147
124 .macro pushfl_cfi 148 .macro pushfl_cfi
125 pushfl 149 pushfl
126 CFI_ADJUST_CFA_OFFSET 4 150 CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
40} 40}
41#endif 41#endif
42 42
43#ifdef CONFIG_MEMTEST
44extern void early_memtest(unsigned long start, unsigned long end);
45#else
46static inline void early_memtest(unsigned long start, unsigned long end)
47{
48}
49#endif
50
51extern unsigned long e820_end_of_ram_pfn(void); 43extern unsigned long e820_end_of_ram_pfn(void);
52extern unsigned long e820_end_of_low_ram_pfn(void); 44extern unsigned long e820_end_of_low_ram_pfn(void);
53extern u64 early_reserve_e820(u64 sizet, u64 align); 45extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 25bce45c6fc4..3738b138b843 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -2,6 +2,8 @@
2#define _ASM_X86_EFI_H 2#define _ASM_X86_EFI_H
3 3
4#include <asm/i387.h> 4#include <asm/i387.h>
5#include <asm/pgtable.h>
6
5/* 7/*
6 * We map the EFI regions needed for runtime services non-contiguously, 8 * We map the EFI regions needed for runtime services non-contiguously,
7 * with preserved alignment on virtual addresses starting from -4G down 9 * with preserved alignment on virtual addresses starting from -4G down
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
89extern struct efi_scratch efi_scratch; 91extern struct efi_scratch efi_scratch;
90extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); 92extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
91extern int __init efi_memblock_x86_reserve_range(void); 93extern int __init efi_memblock_x86_reserve_range(void);
92extern void __init efi_call_phys_prolog(void); 94extern pgd_t * __init efi_call_phys_prolog(void);
93extern void __init efi_call_phys_epilog(void); 95extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
94extern void __init efi_unmap_memmap(void); 96extern void __init efi_unmap_memmap(void);
95extern void __init efi_memory_uc(u64 addr, unsigned long size); 97extern void __init efi_memory_uc(u64 addr, unsigned long size);
96extern void __init efi_map_region(efi_memory_desc_t *md); 98extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,10 +171,11 @@ do { \
171static inline void elf_common_init(struct thread_struct *t, 171static inline void elf_common_init(struct thread_struct *t,
172 struct pt_regs *regs, const u16 ds) 172 struct pt_regs *regs, const u16 ds)
173{ 173{
174 regs->ax = regs->bx = regs->cx = regs->dx = 0; 174 /* Commented-out registers are cleared in stub_execve */
175 regs->si = regs->di = regs->bp = 0; 175 /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
176 regs->si = regs->di /*= regs->bp*/ = 0;
176 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; 177 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
177 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 178 /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
178 t->fs = t->gs = 0; 179 t->fs = t->gs = 0;
179 t->fsindex = t->gsindex = 0; 180 t->fsindex = t->gsindex = 0;
180 t->ds = t->es = ds; 181 t->ds = t->es = ds;
@@ -338,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
338 int uses_interp); 339 int uses_interp);
339#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages 340#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
340 341
341extern unsigned long arch_randomize_brk(struct mm_struct *mm);
342#define arch_randomize_brk arch_randomize_brk
343
344/* 342/*
345 * True on X86_32 or when emulating IA32 on X86_64 343 * True on X86_32 or when emulating IA32 on X86_64
346 */ 344 */
@@ -365,6 +363,7 @@ enum align_flags {
365struct va_alignment { 363struct va_alignment {
366 int flags; 364 int flags;
367 unsigned long mask; 365 unsigned long mask;
366 unsigned long bits;
368} ____cacheline_aligned; 367} ____cacheline_aligned;
369 368
370extern struct va_alignment va_align; 369extern struct va_alignment va_align;
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 72ba21a8b5fc..da5e96756570 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
67static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} 67static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
68#endif 68#endif
69 69
70/*
 71 * Must be run with preemption disabled: this clears fpu_owner_task
 72 * on this CPU.
 73 *
 74 * This will disable any lazy FPU state restore of the current FPU state,
 75 * but if the current thread owns the FPU, it will still be saved.
76 */
77static inline void __cpu_disable_lazy_restore(unsigned int cpu)
78{
79 per_cpu(fpu_owner_task, cpu) = NULL;
80}
81
82/*
83 * Used to indicate that the FPU state in memory is newer than the FPU
84 * state in registers, and the FPU state should be reloaded next time the
85 * task is run. Only safe on the current task, or non-running tasks.
86 */
87static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
88{
89 tsk->thread.fpu.last_cpu = ~0;
90}
91
92static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
93{
94 return new == this_cpu_read_stable(fpu_owner_task) &&
95 cpu == new->thread.fpu.last_cpu;
96}
97
70static inline int is_ia32_compat_frame(void) 98static inline int is_ia32_compat_frame(void)
71{ 99{
72 return config_enabled(CONFIG_IA32_EMULATION) && 100 return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void)
107 135
108static inline void fx_finit(struct i387_fxsave_struct *fx) 136static inline void fx_finit(struct i387_fxsave_struct *fx)
109{ 137{
110 memset(fx, 0, xstate_size);
111 fx->cwd = 0x37f; 138 fx->cwd = 0x37f;
112 fx->mxcsr = MXCSR_DEFAULT; 139 fx->mxcsr = MXCSR_DEFAULT;
113} 140}
@@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
351 __thread_set_has_fpu(tsk); 378 __thread_set_has_fpu(tsk);
352} 379}
353 380
354static inline void __drop_fpu(struct task_struct *tsk) 381static inline void drop_fpu(struct task_struct *tsk)
355{ 382{
383 /*
 384 * Forget the coprocessor state.
385 */
386 preempt_disable();
387 tsk->thread.fpu_counter = 0;
388
356 if (__thread_has_fpu(tsk)) { 389 if (__thread_has_fpu(tsk)) {
357 /* Ignore delayed exceptions from user space */ 390 /* Ignore delayed exceptions from user space */
358 asm volatile("1: fwait\n" 391 asm volatile("1: fwait\n"
@@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk)
360 _ASM_EXTABLE(1b, 2b)); 393 _ASM_EXTABLE(1b, 2b));
361 __thread_fpu_end(tsk); 394 __thread_fpu_end(tsk);
362 } 395 }
363}
364 396
365static inline void drop_fpu(struct task_struct *tsk)
366{
367 /*
368 * Forget coprocessor state..
369 */
370 preempt_disable();
371 tsk->thread.fpu_counter = 0;
372 __drop_fpu(tsk);
373 clear_stopped_child_used_math(tsk); 397 clear_stopped_child_used_math(tsk);
374 preempt_enable(); 398 preempt_enable();
375} 399}
376 400
377static inline void drop_init_fpu(struct task_struct *tsk) 401static inline void restore_init_xstate(void)
402{
403 if (use_xsave())
404 xrstor_state(init_xstate_buf, -1);
405 else
406 fxrstor_checking(&init_xstate_buf->i387);
407}
408
409/*
410 * Reset the FPU state in the eager case and drop it in the lazy case (later use
411 * will reinit it).
412 */
413static inline void fpu_reset_state(struct task_struct *tsk)
378{ 414{
379 if (!use_eager_fpu()) 415 if (!use_eager_fpu())
380 drop_fpu(tsk); 416 drop_fpu(tsk);
381 else { 417 else
382 if (use_xsave()) 418 restore_init_xstate();
383 xrstor_state(init_xstate_buf, -1);
384 else
385 fxrstor_checking(&init_xstate_buf->i387);
386 }
387} 419}
388 420
389/* 421/*
@@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
400 */ 432 */
401typedef struct { int preload; } fpu_switch_t; 433typedef struct { int preload; } fpu_switch_t;
402 434
403/*
404 * Must be run with preemption disabled: this clears the fpu_owner_task,
405 * on this CPU.
406 *
407 * This will disable any lazy FPU state restore of the current FPU state,
408 * but if the current thread owns the FPU, it will still be saved by.
409 */
410static inline void __cpu_disable_lazy_restore(unsigned int cpu)
411{
412 per_cpu(fpu_owner_task, cpu) = NULL;
413}
414
415static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
416{
417 return new == this_cpu_read_stable(fpu_owner_task) &&
418 cpu == new->thread.fpu.last_cpu;
419}
420
421static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) 435static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
422{ 436{
423 fpu_switch_t fpu; 437 fpu_switch_t fpu;
@@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
426 * If the task has used the math, pre-load the FPU on xsave processors 440 * If the task has used the math, pre-load the FPU on xsave processors
427 * or if the past 5 consecutive context-switches used math. 441 * or if the past 5 consecutive context-switches used math.
428 */ 442 */
429 fpu.preload = tsk_used_math(new) && (use_eager_fpu() || 443 fpu.preload = tsk_used_math(new) &&
430 new->thread.fpu_counter > 5); 444 (use_eager_fpu() || new->thread.fpu_counter > 5);
445
431 if (__thread_has_fpu(old)) { 446 if (__thread_has_fpu(old)) {
432 if (!__save_init_fpu(old)) 447 if (!__save_init_fpu(old))
433 cpu = ~0; 448 task_disable_lazy_fpu_restore(old);
434 old->thread.fpu.last_cpu = cpu; 449 else
435 old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ 450 old->thread.fpu.last_cpu = cpu;
451
452 /* But leave fpu_owner_task! */
453 old->thread.fpu.has_fpu = 0;
436 454
437 /* Don't change CR0.TS if we just switch! */ 455 /* Don't change CR0.TS if we just switch! */
438 if (fpu.preload) { 456 if (fpu.preload) {
@@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
443 stts(); 461 stts();
444 } else { 462 } else {
445 old->thread.fpu_counter = 0; 463 old->thread.fpu_counter = 0;
446 old->thread.fpu.last_cpu = ~0; 464 task_disable_lazy_fpu_restore(old);
447 if (fpu.preload) { 465 if (fpu.preload) {
448 new->thread.fpu_counter++; 466 new->thread.fpu_counter++;
449 if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) 467 if (fpu_lazy_restore(new, cpu))
450 fpu.preload = 0; 468 fpu.preload = 0;
451 else 469 else
452 prefetch(new->thread.fpu.state); 470 prefetch(new->thread.fpu.state);
@@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
466{ 484{
467 if (fpu.preload) { 485 if (fpu.preload) {
468 if (unlikely(restore_fpu_checking(new))) 486 if (unlikely(restore_fpu_checking(new)))
469 drop_init_fpu(new); 487 fpu_reset_state(new);
470 } 488 }
471} 489}
472 490
@@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
495} 513}
496 514
497/* 515/*
498 * Need to be preemption-safe. 516 * Needs to be preemption-safe.
499 * 517 *
500 * NOTE! user_fpu_begin() must be used only immediately before restoring 518 * NOTE! user_fpu_begin() must be used only immediately before restoring
501 * it. This function does not do any save/restore on their own. 519 * the save state. It does not do any saving/restoring on its own. In
 520 * lazy FPU mode, it is just an optimization to avoid a #NM exception;
 521 * the task can lose the FPU right after preempt_enable().
502 */ 522 */
503static inline void user_fpu_begin(void) 523static inline void user_fpu_begin(void)
504{ 524{
@@ -520,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk)
520} 540}
521 541
522/* 542/*
523 * These disable preemption on their own and are safe
524 */
525static inline void save_init_fpu(struct task_struct *tsk)
526{
527 WARN_ON_ONCE(!__thread_has_fpu(tsk));
528
529 if (use_eager_fpu()) {
530 __save_fpu(tsk);
531 return;
532 }
533
534 preempt_disable();
535 __save_init_fpu(tsk);
536 __thread_fpu_end(tsk);
537 preempt_enable();
538}
539
540/*
541 * i387 state interaction 543 * i387 state interaction
542 */ 544 */
543static inline unsigned short get_fpu_cwd(struct task_struct *tsk) 545static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
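
A rough userspace model of the lazy-restore bookkeeping that the new task_disable_lazy_fpu_restore()/fpu_lazy_restore() pair implements (the struct, array and function names below are local to this sketch, not kernel API): a task may skip the FPU reload only if it is still the recorded owner on the CPU it last ran on, so poisoning last_cpu, or clearing the per-CPU owner, defeats the optimization.

#include <stdio.h>

struct task {
        const char *name;
        unsigned int last_cpu;     /* CPU whose FPU registers still hold this task's state */
};

static struct task *fpu_owner[2];  /* stand-in for the per-CPU fpu_owner_task variable */

static void disable_lazy_restore(struct task *t)
{
        t->last_cpu = ~0u;         /* can never match a real CPU number again */
}

static int lazy_restore_ok(struct task *t, unsigned int cpu)
{
        return fpu_owner[cpu] == t && t->last_cpu == cpu;
}

int main(void)
{
        struct task a = { "A", 0 };

        fpu_owner[0] = &a;
        printf("A back on CPU0, registers intact: %d\n", lazy_restore_ok(&a, 0)); /* 1 */

        disable_lazy_restore(&a);  /* e.g. its in-memory state became newer */
        printf("A back on CPU0 after poisoning:   %d\n", lazy_restore_ok(&a, 0)); /* 0 */

        return 0;
}

The kernel keys the same decision off per_cpu(fpu_owner_task) and thread.fpu.last_cpu; the sketch only mirrors the shape of the check.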
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 9662290e0b20..e9571ddabc4f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
181extern __visible void smp_invalidate_interrupt(struct pt_regs *); 181extern __visible void smp_invalidate_interrupt(struct pt_regs *);
182#endif 182#endif
183 183
184extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR 184extern char irq_entries_start[];
185 - FIRST_EXTERNAL_VECTOR])(void);
186#ifdef CONFIG_TRACING 185#ifdef CONFIG_TRACING
187#define trace_interrupt interrupt 186#define trace_irq_entries_start irq_entries_start
188#endif 187#endif
189 188
190#define VECTOR_UNDEFINED (-1) 189#define VECTOR_UNDEFINED (-1)
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
69 const insn_byte_t *next_byte; 69 const insn_byte_t *next_byte;
70}; 70};
71 71
72#define MAX_INSN_SIZE 16 72#define MAX_INSN_SIZE 15
73 73
74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) 74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) 75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
index f42a04735a0a..e37d6b3ad983 100644
--- a/arch/x86/include/asm/iommu_table.h
+++ b/arch/x86/include/asm/iommu_table.h
@@ -79,11 +79,12 @@ struct iommu_table_entry {
79 * d). Similar to the 'init', except that this gets called from pci_iommu_init 79 * d). Similar to the 'init', except that this gets called from pci_iommu_init
80 * where we do have a memory allocator. 80 * where we do have a memory allocator.
81 * 81 *
82 * The standard vs the _FINISH differs in that the _FINISH variant will 82 * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
83 * continue detecting other IOMMUs in the call list after the 83 * in that the former will continue detecting other IOMMUs in the call
84 * the detection routine returns a positive number. The _FINISH will 84 * list after the detection routine returns a positive number, while the
85 * stop the execution chain. Both will still call the 'init' and 85 * latter will stop the execution chain upon first successful detection.
86 * 'late_init' functions if they are set. 86 * Both variants will still call the 'init' and 'late_init' functions if
87 * they are set.
87 */ 88 */
88#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ 89#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
89 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) 90 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
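
The reworded comment above is easy to model: each table entry carries a "finish" flag, and the scan over the detection routines either keeps going or stops at the first positive hit depending on that flag. A minimal standalone sketch (the entry names are just labels and the detect functions are invented for illustration):

#include <stdio.h>

struct iommu_entry {
        const char *name;
        int (*detect)(void);
        int finish;                /* 1: stop the chain on a positive detect (IOMMU_INIT_FINISH) */
};

static int detect_none(void)  { return 0; }
static int detect_found(void) { return 1; }

int main(void)
{
        struct iommu_entry table[] = {
                { "first",  detect_none,  0 },
                { "second", detect_found, 1 },   /* positive + finish: chain stops here */
                { "third",  detect_found, 0 },   /* never reached in this run */
        };
        unsigned int i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                int hit = table[i].detect();

                printf("%-8s detect=%d\n", table[i].name, hit);
                if (hit > 0 && table[i].finish)
                        break;
        }
        return 0;
}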
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..b77f5edb03b0 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
136#define USERGS_SYSRET32 \ 136#define USERGS_SYSRET32 \
137 swapgs; \ 137 swapgs; \
138 sysretl 138 sysretl
139#define ENABLE_INTERRUPTS_SYSEXIT32 \
140 swapgs; \
141 sti; \
142 sysexit
143 139
144#else 140#else
145#define INTERRUPT_RETURN iret 141#define INTERRUPT_RETURN iret
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void)
163 159
164 return arch_irqs_disabled_flags(flags); 160 return arch_irqs_disabled_flags(flags);
165} 161}
162#endif /* !__ASSEMBLY__ */
166 163
164#ifdef __ASSEMBLY__
165#ifdef CONFIG_TRACE_IRQFLAGS
166# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
167# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
167#else 168#else
168 169# define TRACE_IRQS_ON
169#ifdef CONFIG_X86_64 170# define TRACE_IRQS_OFF
170#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk 171#endif
171#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ 172#ifdef CONFIG_DEBUG_LOCK_ALLOC
173# ifdef CONFIG_X86_64
174# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
175# define LOCKDEP_SYS_EXIT_IRQ \
172 TRACE_IRQS_ON; \ 176 TRACE_IRQS_ON; \
173 sti; \ 177 sti; \
174 SAVE_REST; \ 178 call lockdep_sys_exit_thunk; \
175 LOCKDEP_SYS_EXIT; \
176 RESTORE_REST; \
177 cli; \ 179 cli; \
178 TRACE_IRQS_OFF; 180 TRACE_IRQS_OFF;
179 181# else
180#else 182# define LOCKDEP_SYS_EXIT \
181#define ARCH_LOCKDEP_SYS_EXIT \
182 pushl %eax; \ 183 pushl %eax; \
183 pushl %ecx; \ 184 pushl %ecx; \
184 pushl %edx; \ 185 pushl %edx; \
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void)
186 popl %edx; \ 187 popl %edx; \
187 popl %ecx; \ 188 popl %ecx; \
188 popl %eax; 189 popl %eax;
189 190# define LOCKDEP_SYS_EXIT_IRQ
190#define ARCH_LOCKDEP_SYS_EXIT_IRQ 191# endif
191#endif
192
193#ifdef CONFIG_TRACE_IRQFLAGS
194# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
195# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
196#else 192#else
197# define TRACE_IRQS_ON
198# define TRACE_IRQS_OFF
199#endif
200#ifdef CONFIG_DEBUG_LOCK_ALLOC
201# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
202# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
203# else
204# define LOCKDEP_SYS_EXIT 193# define LOCKDEP_SYS_EXIT
205# define LOCKDEP_SYS_EXIT_IRQ 194# define LOCKDEP_SYS_EXIT_IRQ
206# endif 195#endif
207
208#endif /* __ASSEMBLY__ */ 196#endif /* __ASSEMBLY__ */
197
209#endif 198#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 6a2cefb4395a..a4c1cf7e93f8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_X86_JUMP_LABEL_H 1#ifndef _ASM_X86_JUMP_LABEL_H
2#define _ASM_X86_JUMP_LABEL_H 2#define _ASM_X86_JUMP_LABEL_H
3 3
4#ifdef __KERNEL__ 4#ifndef __ASSEMBLY__
5 5
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/types.h> 7#include <linux/types.h>
@@ -30,8 +30,6 @@ l_yes:
30 return true; 30 return true;
31} 31}
32 32
33#endif /* __KERNEL__ */
34
35#ifdef CONFIG_X86_64 33#ifdef CONFIG_X86_64
36typedef u64 jump_label_t; 34typedef u64 jump_label_t;
37#else 35#else
@@ -44,4 +42,5 @@ struct jump_entry {
44 jump_label_t key; 42 jump_label_t key;
45}; 43};
46 44
45#endif /* __ASSEMBLY__ */
47#endif 46#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39cc385..dea2e7e962e3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
81 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 81 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
82} 82}
83 83
84#define SELECTOR_TI_MASK (1 << 2)
85#define SELECTOR_RPL_MASK 0x03
86
87#define IOPL_SHIFT 12
88
89#define KVM_PERMILLE_MMU_PAGES 20 84#define KVM_PERMILLE_MMU_PAGES 20
90#define KVM_MIN_ALLOC_MMU_PAGES 64 85#define KVM_MIN_ALLOC_MMU_PAGES 64
91#define KVM_MMU_HASH_SHIFT 10 86#define KVM_MMU_HASH_SHIFT 10
@@ -345,6 +340,7 @@ struct kvm_pmu {
345enum { 340enum {
346 KVM_DEBUGREG_BP_ENABLED = 1, 341 KVM_DEBUGREG_BP_ENABLED = 1,
347 KVM_DEBUGREG_WONT_EXIT = 2, 342 KVM_DEBUGREG_WONT_EXIT = 2,
343 KVM_DEBUGREG_RELOAD = 4,
348}; 344};
349 345
350struct kvm_vcpu_arch { 346struct kvm_vcpu_arch {
@@ -431,6 +427,9 @@ struct kvm_vcpu_arch {
431 427
432 int cpuid_nent; 428 int cpuid_nent;
433 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 429 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
430
431 int maxphyaddr;
432
434 /* emulate context */ 433 /* emulate context */
435 434
436 struct x86_emulate_ctxt emulate_ctxt; 435 struct x86_emulate_ctxt emulate_ctxt;
@@ -550,11 +549,20 @@ struct kvm_arch_memory_slot {
550 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; 549 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
551}; 550};
552 551
552/*
553 * We use as the mode the number of bits allocated in the LDR for the
554 * logical processor ID. It happens that these are all powers of two.
 555 * This makes it very easy to detect cases where the APICs are
556 * configured for multiple modes; in that case, we cannot use the map and
557 * hence cannot use kvm_irq_delivery_to_apic_fast either.
558 */
559#define KVM_APIC_MODE_XAPIC_CLUSTER 4
560#define KVM_APIC_MODE_XAPIC_FLAT 8
561#define KVM_APIC_MODE_X2APIC 16
562
553struct kvm_apic_map { 563struct kvm_apic_map {
554 struct rcu_head rcu; 564 struct rcu_head rcu;
555 u8 ldr_bits; 565 u8 mode;
556 /* fields bellow are used to decode ldr values in different modes */
557 u32 cid_shift, cid_mask, lid_mask, broadcast;
558 struct kvm_lapic *phys_map[256]; 566 struct kvm_lapic *phys_map[256];
559 /* first index is cluster id second is cpu id in a cluster */ 567 /* first index is cluster id second is cpu id in a cluster */
560 struct kvm_lapic *logical_map[16][16]; 568 struct kvm_lapic *logical_map[16][16];
@@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
859void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 867void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
860void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 868void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
861 struct kvm_memory_slot *memslot); 869 struct kvm_memory_slot *memslot);
870void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
871 struct kvm_memory_slot *memslot);
862void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 872void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
863 struct kvm_memory_slot *memslot); 873 struct kvm_memory_slot *memslot);
864void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 874void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -933,6 +943,7 @@ struct x86_emulate_ctxt;
933int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 943int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
934void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 944void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
935int kvm_emulate_halt(struct kvm_vcpu *vcpu); 945int kvm_emulate_halt(struct kvm_vcpu *vcpu);
946int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
936int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 947int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
937 948
938void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 949void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1128int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 1139int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
1129int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 1140int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
1130void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 1141void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
1131int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
1132int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); 1142int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1133int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1143int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1134int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1144int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
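
The new KVM_APIC_MODE_* constants encode how many low bits of a logical destination belong to the per-CPU part, which is what lets one decode path cover all three modes. A hedged standalone sketch of that split (the helper below is illustrative and is not KVM's actual decode code, which lives in lapic.c):

#include <stdio.h>
#include <stdint.h>

#define KVM_APIC_MODE_XAPIC_CLUSTER     4
#define KVM_APIC_MODE_XAPIC_FLAT        8
#define KVM_APIC_MODE_X2APIC            16

/* Split a logical ID into (cluster, in-cluster bitmask) for a given mode. */
static void split_logical_id(uint32_t ldr, unsigned int mode)
{
        uint32_t mask = (1u << mode) - 1;

        printf("mode %2u: ldr 0x%05x -> cluster %u, in-cluster mask 0x%x\n",
               mode, ldr, ldr >> mode, ldr & mask);
}

int main(void)
{
        split_logical_id(0x23, KVM_APIC_MODE_XAPIC_CLUSTER);  /* cluster 2, CPU bits 0x3 */
        split_logical_id(0x23, KVM_APIC_MODE_XAPIC_FLAT);     /* everything in cluster 0 */
        split_logical_id(0x10001, KVM_APIC_MODE_X2APIC);      /* cluster 1, CPU bit 0x1 */
        return 0;
}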
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e62cf897f781..c1adf33fdd0d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void)
115 115
116static inline bool kvm_para_available(void) 116static inline bool kvm_para_available(void)
117{ 117{
118 return 0; 118 return false;
119} 119}
120 120
121static inline unsigned int kvm_arch_para_features(void) 121static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
32#endif 32#endif
33 return 0; 33 return 0;
34} 34}
35extern int klp_write_module_reloc(struct module *mod, unsigned long type, 35int klp_write_module_reloc(struct module *mod, unsigned long type,
36 unsigned long loc, unsigned long value); 36 unsigned long loc, unsigned long value);
37 37
38static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) 38static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
39{ 39{
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9b3de99dc004..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
116 u32 rip_msr; 116 u32 rip_msr;
117}; 117};
118 118
119struct mce_vendor_flags {
120 __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
121 __reserved_0 : 63;
122};
123extern struct mce_vendor_flags mce_flags;
124
119extern struct mca_config mca_cfg; 125extern struct mca_config mca_cfg;
120extern void mce_register_decode_chain(struct notifier_block *nb); 126extern void mce_register_decode_chain(struct notifier_block *nb);
121extern void mce_unregister_decode_chain(struct notifier_block *nb); 127extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
128#ifdef CONFIG_X86_MCE 134#ifdef CONFIG_X86_MCE
129int mcheck_init(void); 135int mcheck_init(void);
130void mcheck_cpu_init(struct cpuinfo_x86 *c); 136void mcheck_cpu_init(struct cpuinfo_x86 *c);
137void mcheck_vendor_init_severity(void);
131#else 138#else
132static inline int mcheck_init(void) { return 0; } 139static inline int mcheck_init(void) { return 0; }
133static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} 140static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
141static inline void mcheck_vendor_init_severity(void) {}
134#endif 142#endif
135 143
136#ifdef CONFIG_X86_ANCIENT_MCE 144#ifdef CONFIG_X86_ANCIENT_MCE
@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
183DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 191DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
184 192
185enum mcp_flags { 193enum mcp_flags {
186 MCP_TIMESTAMP = (1 << 0), /* log time stamp */ 194 MCP_TIMESTAMP = BIT(0), /* log time stamp */
187 MCP_UC = (1 << 1), /* log uncorrected errors */ 195 MCP_UC = BIT(1), /* log uncorrected errors */
188 MCP_DONTLOG = (1 << 2), /* only clear, don't log */ 196 MCP_DONTLOG = BIT(2), /* only clear, don't log */
189}; 197};
190void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 198bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
191 199
192int mce_notify_irq(void); 200int mce_notify_irq(void);
193 201
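
A small sketch of how the new mce_vendor_flags bitfield would be populated from the CPUID leaf named in its comment; the sample register value is made up, the bit position (bit 0) is my reading of the AMD leaf, and in the kernel the read happens via cpuid_ebx(0x80000007) during vendor init.

#include <stdio.h>
#include <stdint.h>

struct vendor_flags {                   /* same shape as the new struct mce_vendor_flags */
        uint64_t overflow_recov : 1,    /* taken here as bit 0 of CPUID 0x80000007 EBX */
                 __reserved_0   : 63;
};

int main(void)
{
        uint32_t ebx = 0x00000001;      /* pretend output of cpuid_ebx(0x80000007) */
        struct vendor_flags flags = { .overflow_recov = ebx & 1 };

        printf("MCA overflow recovery: %s\n",
               flags.overflow_recov ? "supported" : "not supported");
        return 0;
}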
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 201b520521ed..2fb20d6f7e23 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {}
75 75
76#ifdef CONFIG_MICROCODE_EARLY 76#ifdef CONFIG_MICROCODE_EARLY
77#define MAX_UCODE_COUNT 128 77#define MAX_UCODE_COUNT 128
78
79#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
80#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
81#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
82#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
83#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
84#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
85#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
86
87#define CPUID_IS(a, b, c, ebx, ecx, edx) \
88 (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
89
90/*
 91 * In the early microcode loading phase on the BSP, boot_cpu_data is not set
 92 * up yet, so x86_vendor() gets the vendor id for the BSP.
 93 *
 94 * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
 95 * simplify the code, we still use x86_vendor() to get the vendor id for APs.
96 *
97 * x86_vendor() gets vendor information directly from CPUID.
98 */
99static inline int x86_vendor(void)
100{
101 u32 eax = 0x00000000;
102 u32 ebx, ecx = 0, edx;
103
104 native_cpuid(&eax, &ebx, &ecx, &edx);
105
106 if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
107 return X86_VENDOR_INTEL;
108
109 if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
110 return X86_VENDOR_AMD;
111
112 return X86_VENDOR_UNKNOWN;
113}
114
115static inline unsigned int __x86_family(unsigned int sig)
116{
117 unsigned int x86;
118
119 x86 = (sig >> 8) & 0xf;
120
121 if (x86 == 0xf)
122 x86 += (sig >> 20) & 0xff;
123
124 return x86;
125}
126
127static inline unsigned int x86_family(void)
128{
129 u32 eax = 0x00000001;
130 u32 ebx, ecx = 0, edx;
131
132 native_cpuid(&eax, &ebx, &ecx, &edx);
133
134 return __x86_family(eax);
135}
136
137static inline unsigned int x86_model(unsigned int sig)
138{
139 unsigned int x86, model;
140
141 x86 = __x86_family(sig);
142
143 model = (sig >> 4) & 0xf;
144
145 if (x86 == 0x6 || x86 == 0xf)
146 model += ((sig >> 16) & 0xf) << 4;
147
148 return model;
149}
150
78extern void __init load_ucode_bsp(void); 151extern void __init load_ucode_bsp(void);
79extern void load_ucode_ap(void); 152extern void load_ucode_ap(void);
80extern int __init save_microcode_in_initrd(void); 153extern int __init save_microcode_in_initrd(void);
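
Worked example of the new __x86_family()/x86_model() helpers: the extended family field only kicks in for family 0xf, and the extended model field only for families 6 and 0xf. The standalone program below re-implements the same bit arithmetic under local names; the sample signature value is purely illustrative.

#include <stdio.h>

static unsigned int x86_family_from_sig(unsigned int sig)
{
        unsigned int x86 = (sig >> 8) & 0xf;

        if (x86 == 0xf)
                x86 += (sig >> 20) & 0xff;
        return x86;
}

static unsigned int x86_model_from_sig(unsigned int sig)
{
        unsigned int x86 = x86_family_from_sig(sig);
        unsigned int model = (sig >> 4) & 0xf;

        if (x86 == 0x6 || x86 == 0xf)
                model += ((sig >> 16) & 0xf) << 4;
        return model;
}

int main(void)
{
        unsigned int sig = 0x000306c3;  /* sample signature: family 6, model 0x3c, stepping 3 */

        printf("family 0x%x, model 0x%x, stepping %u\n",
               x86_family_from_sig(sig), x86_model_from_sig(sig), sig & 0xf);
        return 0;
}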
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index dd4c20043ce7..2b9209c46ca9 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -56,12 +56,15 @@ struct extended_sigtable {
56 56
57#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 57#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
58 58
59extern int 59extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
60get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
61extern int microcode_sanity_check(void *mc, int print_err); 60extern int microcode_sanity_check(void *mc, int print_err);
62extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); 61extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
63extern int 62
64update_match_revision(struct microcode_header_intel *mc_header, int rev); 63static inline int
64revision_is_newer(struct microcode_header_intel *mc_header, int rev)
65{
66 return (mc_header->rev <= rev) ? 0 : 1;
67}
65 68
66#ifdef CONFIG_MICROCODE_INTEL_EARLY 69#ifdef CONFIG_MICROCODE_INTEL_EARLY
67extern void __init load_ucode_intel_bsp(void); 70extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a1410db38a1a..653dfa7662e1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
30 :: "a" (eax), "c" (ecx)); 30 :: "a" (eax), "c" (ecx));
31} 31}
32 32
33static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
34{
35 trace_hardirqs_on();
36 /* "mwait %eax, %ecx;" */
37 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
38 :: "a" (eax), "c" (ecx));
39}
40
33/* 41/*
34 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 42 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
35 * which can obviate IPI to trigger checking of need_resched. 43 * which can obviate IPI to trigger checking of need_resched.
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
40 40
41#ifdef CONFIG_X86_64 41#ifdef CONFIG_X86_64
42#include <asm/page_64_types.h> 42#include <asm/page_64_types.h>
43#define IOREMAP_MAX_ORDER (PUD_SHIFT)
43#else 44#else
44#include <asm/page_32_types.h> 45#include <asm/page_32_types.h>
46#define IOREMAP_MAX_ORDER (PMD_SHIFT)
45#endif /* CONFIG_X86_64 */ 47#endif /* CONFIG_X86_64 */
46 48
47#ifndef __ASSEMBLY__ 49#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); 545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
546} 546}
547 547
548#if PAGETABLE_LEVELS >= 3 548#if CONFIG_PGTABLE_LEVELS >= 3
549static inline pmd_t __pmd(pmdval_t val) 549static inline pmd_t __pmd(pmdval_t val)
550{ 550{
551 pmdval_t ret; 551 pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, 585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
586 val); 586 val);
587} 587}
588#if PAGETABLE_LEVELS == 4 588#if CONFIG_PGTABLE_LEVELS == 4
589static inline pud_t __pud(pudval_t val) 589static inline pud_t __pud(pudval_t val)
590{ 590{
591 pudval_t ret; 591 pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
636 set_pud(pudp, __pud(0)); 636 set_pud(pudp, __pud(0));
637} 637}
638 638
639#endif /* PAGETABLE_LEVELS == 4 */ 639#endif /* CONFIG_PGTABLE_LEVELS == 4 */
640 640
641#endif /* PAGETABLE_LEVELS >= 3 */ 641#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
642 642
643#ifdef CONFIG_X86_PAE 643#ifdef CONFIG_X86_PAE
644/* Special-case pte-setting operations for PAE, which can't update a 644/* Special-case pte-setting operations for PAE, which can't update a
@@ -976,11 +976,6 @@ extern void default_banner(void);
976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ 976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
977 CLBR_NONE, \ 977 CLBR_NONE, \
978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) 978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
979
980#define ENABLE_INTERRUPTS_SYSEXIT32 \
981 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
982 CLBR_NONE, \
983 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
984#endif /* CONFIG_X86_32 */ 979#endif /* CONFIG_X86_32 */
985 980
986#endif /* __ASSEMBLY__ */ 981#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
294 struct paravirt_callee_save pgd_val; 294 struct paravirt_callee_save pgd_val;
295 struct paravirt_callee_save make_pgd; 295 struct paravirt_callee_save make_pgd;
296 296
297#if PAGETABLE_LEVELS >= 3 297#if CONFIG_PGTABLE_LEVELS >= 3
298#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
308 struct paravirt_callee_save pmd_val; 308 struct paravirt_callee_save pmd_val;
309 struct paravirt_callee_save make_pmd; 309 struct paravirt_callee_save make_pmd;
310 310
311#if PAGETABLE_LEVELS == 4 311#if CONFIG_PGTABLE_LEVELS == 4
312 struct paravirt_callee_save pud_val; 312 struct paravirt_callee_save pud_val;
313 struct paravirt_callee_save make_pud; 313 struct paravirt_callee_save make_pud;
314 314
315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
316#endif /* PAGETABLE_LEVELS == 4 */ 316#endif /* CONFIG_PGTABLE_LEVELS == 4 */
317#endif /* PAGETABLE_LEVELS >= 3 */ 317#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
318 318
319 struct pv_lazy_ops lazy_mode; 319 struct pv_lazy_ops lazy_mode;
320 320
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
77 77
78#define pmd_pgtable(pmd) pmd_page(pmd) 78#define pmd_pgtable(pmd) pmd_page(pmd)
79 79
80#if PAGETABLE_LEVELS > 2 80#if CONFIG_PGTABLE_LEVELS > 2
81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
82{ 82{
83 struct page *page; 83 struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
116} 116}
117#endif /* CONFIG_X86_PAE */ 117#endif /* CONFIG_X86_PAE */
118 118
119#if PAGETABLE_LEVELS > 3 119#if CONFIG_PGTABLE_LEVELS > 3
120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
121{ 121{
122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); 122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
142 ___pud_free_tlb(tlb, pud); 142 ___pud_free_tlb(tlb, pud);
143} 143}
144 144
145#endif /* PAGETABLE_LEVELS > 3 */ 145#endif /* CONFIG_PGTABLE_LEVELS > 3 */
146#endif /* PAGETABLE_LEVELS > 2 */ 146#endif /* CONFIG_PGTABLE_LEVELS > 2 */
147 147
148#endif /* _ASM_X86_PGALLOC_H */ 148#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
17#endif /* !__ASSEMBLY__ */ 17#endif /* !__ASSEMBLY__ */
18 18
19#define SHARED_KERNEL_PMD 0 19#define SHARED_KERNEL_PMD 0
20#define PAGETABLE_LEVELS 2
21 20
22/* 21/*
23 * traditional i386 two-level paging structure: 22 * traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
24#define SHARED_KERNEL_PMD 1 24#define SHARED_KERNEL_PMD 1
25#endif 25#endif
26 26
27#define PAGETABLE_LEVELS 3
28
29/* 27/*
30 * PGDIR_SHIFT determines what a top-level page table entry can map 28 * PGDIR_SHIFT determines what a top-level page table entry can map
31 */ 29 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
551 return npg >> (20 - PAGE_SHIFT); 551 return npg >> (20 - PAGE_SHIFT);
552} 552}
553 553
554#if PAGETABLE_LEVELS > 2 554#if CONFIG_PGTABLE_LEVELS > 2
555static inline int pud_none(pud_t pud) 555static inline int pud_none(pud_t pud)
556{ 556{
557 return native_pud_val(pud) == 0; 557 return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
594{ 594{
595 return 0; 595 return 0;
596} 596}
597#endif /* PAGETABLE_LEVELS > 2 */ 597#endif /* CONFIG_PGTABLE_LEVELS > 2 */
598 598
599#if PAGETABLE_LEVELS > 3 599#if CONFIG_PGTABLE_LEVELS > 3
600static inline int pgd_present(pgd_t pgd) 600static inline int pgd_present(pgd_t pgd)
601{ 601{
602 return pgd_flags(pgd) & _PAGE_PRESENT; 602 return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
633{ 633{
634 return !native_pgd_val(pgd); 634 return !native_pgd_val(pgd);
635} 635}
636#endif /* PAGETABLE_LEVELS > 3 */ 636#endif /* CONFIG_PGTABLE_LEVELS > 3 */
637 637
638#endif /* __ASSEMBLY__ */ 638#endif /* __ASSEMBLY__ */
639 639
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
20#endif /* !__ASSEMBLY__ */ 20#endif /* !__ASSEMBLY__ */
21 21
22#define SHARED_KERNEL_PMD 0 22#define SHARED_KERNEL_PMD 0
23#define PAGETABLE_LEVELS 4
24 23
25/* 24/*
26 * PGDIR_SHIFT determines what a top-level page table entry can map 25 * PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
234 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 234 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
235} 235}
236 236
237#if PAGETABLE_LEVELS > 3 237#if CONFIG_PGTABLE_LEVELS > 3
238typedef struct { pudval_t pud; } pud_t; 238typedef struct { pudval_t pud; } pud_t;
239 239
240static inline pud_t native_make_pud(pmdval_t val) 240static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
255} 255}
256#endif 256#endif
257 257
258#if PAGETABLE_LEVELS > 2 258#if CONFIG_PGTABLE_LEVELS > 2
259typedef struct { pmdval_t pmd; } pmd_t; 259typedef struct { pmdval_t pmd; } pmd_t;
260 260
261static inline pmd_t native_make_pmd(pmdval_t val) 261static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/pm-trace.h
index 3ff1c2cb1da5..7b7ac42c3661 100644
--- a/arch/x86/include/asm/resume-trace.h
+++ b/arch/x86/include/asm/pm-trace.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_RESUME_TRACE_H 1#ifndef _ASM_X86_PM_TRACE_H
2#define _ASM_X86_RESUME_TRACE_H 2#define _ASM_X86_PM_TRACE_H
3 3
4#include <asm/asm.h> 4#include <asm/asm.h>
5 5
@@ -14,8 +14,10 @@ do { \
14 ".previous" \ 14 ".previous" \
15 :"=r" (tracedata) \ 15 :"=r" (tracedata) \
16 : "i" (__LINE__), "i" (__FILE__)); \ 16 : "i" (__LINE__), "i" (__FILE__)); \
17 generate_resume_trace(tracedata, user); \ 17 generate_pm_trace(tracedata, user); \
18 } \ 18 } \
19} while (0) 19} while (0)
20 20
21#endif /* _ASM_X86_RESUME_TRACE_H */ 21#define TRACE_SUSPEND(user) TRACE_RESUME(user)
22
23#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..23ba6765b718 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
109 /* in KB - valid for CPUS which support this call: */ 109 /* in KB - valid for CPUS which support this call: */
110 int x86_cache_size; 110 int x86_cache_size;
111 int x86_cache_alignment; /* In bytes */ 111 int x86_cache_alignment; /* In bytes */
112 /* Cache QoS architectural values: */
113 int x86_cache_max_rmid; /* max index */
114 int x86_cache_occ_scale; /* scale to bytes */
112 int x86_power; 115 int x86_power;
113 unsigned long loops_per_jiffy; 116 unsigned long loops_per_jiffy;
114 /* cpuid returned max cores value: */ 117 /* cpuid returned max cores value: */
@@ -210,8 +213,23 @@ struct x86_hw_tss {
210 unsigned long sp0; 213 unsigned long sp0;
211 unsigned short ss0, __ss0h; 214 unsigned short ss0, __ss0h;
212 unsigned long sp1; 215 unsigned long sp1;
213 /* ss1 caches MSR_IA32_SYSENTER_CS: */ 216
214 unsigned short ss1, __ss1h; 217 /*
218 * We don't use ring 1, so ss1 is a convenient scratch space in
219 * the same cacheline as sp0. We use ss1 to cache the value in
220 * MSR_IA32_SYSENTER_CS. When we context switch
221 * MSR_IA32_SYSENTER_CS, we first check if the new value being
 222 * written matches ss1 and, if it does not, we wrmsr the new
223 * value and update ss1.
224 *
225 * The only reason we context switch MSR_IA32_SYSENTER_CS is
226 * that we set it to zero in vm86 tasks to avoid corrupting the
227 * stack if we were to go through the sysenter path from vm86
228 * mode.
229 */
230 unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
231
232 unsigned short __ss1h;
215 unsigned long sp2; 233 unsigned long sp2;
216 unsigned short ss2, __ss2h; 234 unsigned short ss2, __ss2h;
217 unsigned long __cr3; 235 unsigned long __cr3;
@@ -276,13 +294,17 @@ struct tss_struct {
276 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 294 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
277 295
278 /* 296 /*
279 * .. and then another 0x100 bytes for the emergency kernel stack: 297 * Space for the temporary SYSENTER stack:
280 */ 298 */
281 unsigned long stack[64]; 299 unsigned long SYSENTER_stack[64];
282 300
283} ____cacheline_aligned; 301} ____cacheline_aligned;
284 302
285DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); 303DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
304
305#ifdef CONFIG_X86_32
306DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
307#endif
286 308
287/* 309/*
288 * Save the original ist values for checking stack pointers during debugging 310 * Save the original ist values for checking stack pointers during debugging
@@ -474,7 +496,6 @@ struct thread_struct {
474#ifdef CONFIG_X86_32 496#ifdef CONFIG_X86_32
475 unsigned long sysenter_cs; 497 unsigned long sysenter_cs;
476#else 498#else
477 unsigned long usersp; /* Copy from PDA */
478 unsigned short es; 499 unsigned short es;
479 unsigned short ds; 500 unsigned short ds;
480 unsigned short fsindex; 501 unsigned short fsindex;
@@ -564,6 +585,16 @@ static inline void native_swapgs(void)
564#endif 585#endif
565} 586}
566 587
588static inline unsigned long current_top_of_stack(void)
589{
590#ifdef CONFIG_X86_64
591 return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
592#else
593 /* sp0 on x86_32 is special in and around vm86 mode. */
594 return this_cpu_read_stable(cpu_current_top_of_stack);
595#endif
596}
597
567#ifdef CONFIG_PARAVIRT 598#ifdef CONFIG_PARAVIRT
568#include <asm/paravirt.h> 599#include <asm/paravirt.h>
569#else 600#else
@@ -761,10 +792,10 @@ extern char ignore_fpu_irq;
761#define ARCH_HAS_SPINLOCK_PREFETCH 792#define ARCH_HAS_SPINLOCK_PREFETCH
762 793
763#ifdef CONFIG_X86_32 794#ifdef CONFIG_X86_32
764# define BASE_PREFETCH ASM_NOP4 795# define BASE_PREFETCH ""
765# define ARCH_HAS_PREFETCH 796# define ARCH_HAS_PREFETCH
766#else 797#else
767# define BASE_PREFETCH "prefetcht0 (%1)" 798# define BASE_PREFETCH "prefetcht0 %P1"
768#endif 799#endif
769 800
770/* 801/*
@@ -775,10 +806,9 @@ extern char ignore_fpu_irq;
775 */ 806 */
776static inline void prefetch(const void *x) 807static inline void prefetch(const void *x)
777{ 808{
778 alternative_input(BASE_PREFETCH, 809 alternative_input(BASE_PREFETCH, "prefetchnta %P1",
779 "prefetchnta (%1)",
780 X86_FEATURE_XMM, 810 X86_FEATURE_XMM,
781 "r" (x)); 811 "m" (*(const char *)x));
782} 812}
783 813
784/* 814/*
@@ -788,10 +818,9 @@ static inline void prefetch(const void *x)
788 */ 818 */
789static inline void prefetchw(const void *x) 819static inline void prefetchw(const void *x)
790{ 820{
791 alternative_input(BASE_PREFETCH, 821 alternative_input(BASE_PREFETCH, "prefetchw %P1",
792 "prefetchw (%1)", 822 X86_FEATURE_3DNOWPREFETCH,
793 X86_FEATURE_3DNOW, 823 "m" (*(const char *)x));
794 "r" (x));
795} 824}
796 825
797static inline void spin_lock_prefetch(const void *x) 826static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +828,9 @@ static inline void spin_lock_prefetch(const void *x)
799 prefetchw(x); 828 prefetchw(x);
800} 829}
801 830
831#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
832 TOP_OF_KERNEL_STACK_PADDING)
833
802#ifdef CONFIG_X86_32 834#ifdef CONFIG_X86_32
803/* 835/*
804 * User space process size: 3GB (default). 836 * User space process size: 3GB (default).
@@ -809,39 +841,16 @@ static inline void spin_lock_prefetch(const void *x)
809#define STACK_TOP_MAX STACK_TOP 841#define STACK_TOP_MAX STACK_TOP
810 842
811#define INIT_THREAD { \ 843#define INIT_THREAD { \
812 .sp0 = sizeof(init_stack) + (long)&init_stack, \ 844 .sp0 = TOP_OF_INIT_STACK, \
813 .vm86_info = NULL, \ 845 .vm86_info = NULL, \
814 .sysenter_cs = __KERNEL_CS, \ 846 .sysenter_cs = __KERNEL_CS, \
815 .io_bitmap_ptr = NULL, \ 847 .io_bitmap_ptr = NULL, \
816} 848}
817 849
818/*
819 * Note that the .io_bitmap member must be extra-big. This is because
820 * the CPU will access an additional byte beyond the end of the IO
821 * permission bitmap. The extra byte must be all 1 bits, and must
822 * be within the limit.
823 */
824#define INIT_TSS { \
825 .x86_tss = { \
826 .sp0 = sizeof(init_stack) + (long)&init_stack, \
827 .ss0 = __KERNEL_DS, \
828 .ss1 = __KERNEL_CS, \
829 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
830 }, \
831 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
832}
833
834extern unsigned long thread_saved_pc(struct task_struct *tsk); 850extern unsigned long thread_saved_pc(struct task_struct *tsk);
835 851
836#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
837#define KSTK_TOP(info) \
838({ \
839 unsigned long *__ptr = (unsigned long *)(info); \
840 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
841})
842
843/* 852/*
844 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 853 * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
845 * This is necessary to guarantee that the entire "struct pt_regs" 854 * This is necessary to guarantee that the entire "struct pt_regs"
846 * is accessible even if the CPU haven't stored the SS/ESP registers 855 * is accessible even if the CPU haven't stored the SS/ESP registers
847 * on the stack (interrupt gate does not save these registers 856 * on the stack (interrupt gate does not save these registers
@@ -850,11 +859,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
850 * "struct pt_regs" is possible, but they may contain the 859 * "struct pt_regs" is possible, but they may contain the
851 * completely wrong values. 860 * completely wrong values.
852 */ 861 */
853#define task_pt_regs(task) \ 862#define task_pt_regs(task) \
854({ \ 863({ \
855 struct pt_regs *__regs__; \ 864 unsigned long __ptr = (unsigned long)task_stack_page(task); \
856 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ 865 __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
857 __regs__ - 1; \ 866 ((struct pt_regs *)__ptr) - 1; \
858}) 867})
859 868
860#define KSTK_ESP(task) (task_pt_regs(task)->sp) 869#define KSTK_ESP(task) (task_pt_regs(task)->sp)
@@ -886,11 +895,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
886#define STACK_TOP_MAX TASK_SIZE_MAX 895#define STACK_TOP_MAX TASK_SIZE_MAX
887 896
888#define INIT_THREAD { \ 897#define INIT_THREAD { \
889 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ 898 .sp0 = TOP_OF_INIT_STACK \
890}
891
892#define INIT_TSS { \
893 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
894} 899}
895 900
896/* 901/*
@@ -902,11 +907,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
902#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) 907#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
903extern unsigned long KSTK_ESP(struct task_struct *task); 908extern unsigned long KSTK_ESP(struct task_struct *task);
904 909
905/*
906 * User space RSP while inside the SYSCALL fast path
907 */
908DECLARE_PER_CPU(unsigned long, old_rsp);
909
910#endif /* CONFIG_X86_64 */ 910#endif /* CONFIG_X86_64 */
911 911
912extern void start_thread(struct pt_regs *regs, unsigned long new_ip, 912extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
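
The new 32-bit task_pt_regs() is plain pointer arithmetic: top of the stack allocation, minus the configured padding, minus one struct pt_regs. A quick sketch with invented numbers (THREAD_SIZE, the padding value, the stack address and the abridged register layout below are all placeholders, not the real configuration):

#include <stdio.h>

#define THREAD_SIZE                     8192UL  /* placeholder: two 4K pages */
#define TOP_OF_KERNEL_STACK_PADDING     8UL     /* placeholder padding */

struct pt_regs_sketch {                         /* abridged stand-in for struct pt_regs */
        unsigned long bx, cx, dx, si, di, bp, ax;
        unsigned long ds, es, fs, gs, orig_ax, ip, cs, flags, sp, ss;
};

int main(void)
{
        unsigned long stack_page = 0xc1000000UL;  /* pretend task_stack_page(task) */
        unsigned long top = stack_page + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        struct pt_regs_sketch *regs = (struct pt_regs_sketch *)top - 1;

        printf("stack page   %#lx\n", stack_page);
        printf("top of stack %#lx\n", top);
        printf("pt_regs at   %#lx (%zu bytes below the top)\n",
               (unsigned long)regs, sizeof(*regs));
        return 0;
}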
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..19507ffa5d28 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
31#else /* __i386__ */ 31#else /* __i386__ */
32 32
33struct pt_regs { 33struct pt_regs {
34/*
35 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 36 * unless the syscall needs a complete, fully filled "struct pt_regs".
37 */
34 unsigned long r15; 38 unsigned long r15;
35 unsigned long r14; 39 unsigned long r14;
36 unsigned long r13; 40 unsigned long r13;
37 unsigned long r12; 41 unsigned long r12;
38 unsigned long bp; 42 unsigned long bp;
39 unsigned long bx; 43 unsigned long bx;
40/* arguments: non interrupts/non tracing syscalls only save up to here*/ 44/* These regs are callee-clobbered. Always saved on kernel entry. */
41 unsigned long r11; 45 unsigned long r11;
42 unsigned long r10; 46 unsigned long r10;
43 unsigned long r9; 47 unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
47 unsigned long dx; 51 unsigned long dx;
48 unsigned long si; 52 unsigned long si;
49 unsigned long di; 53 unsigned long di;
54/*
55 * On syscall entry, this is syscall#. On CPU exception, this is error code.
56 * On hw interrupt, it's IRQ number:
57 */
50 unsigned long orig_ax; 58 unsigned long orig_ax;
51/* end of arguments */ 59/* Return frame for iretq */
52/* cpu exception frame or undefined */
53 unsigned long ip; 60 unsigned long ip;
54 unsigned long cs; 61 unsigned long cs;
55 unsigned long flags; 62 unsigned long flags;
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
89} 96}
90 97
91/* 98/*
92 * user_mode_vm(regs) determines whether a register set came from user mode. 99 * user_mode(regs) determines whether a register set came from user
93 * This is true if V8086 mode was enabled OR if the register set was from 100 * mode. On x86_32, this is true if V8086 mode was enabled OR if the
94 * protected mode with RPL-3 CS value. This tricky test checks that with 101 * register set was from protected mode with RPL-3 CS value. This
95 * one comparison. Many places in the kernel can bypass this full check 102 * tricky test checks that with one comparison.
96 * if they have already ruled out V8086 mode, so user_mode(regs) can be used. 103 *
104 * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
105 * the extra check.
97 */ 106 */
98static inline int user_mode(struct pt_regs *regs) 107static inline int user_mode(struct pt_regs *regs)
99{ 108{
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
104#endif 113#endif
105} 114}
106 115
107static inline int user_mode_vm(struct pt_regs *regs)
108{
109#ifdef CONFIG_X86_32
110 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
111 USER_RPL;
112#else
113 return user_mode(regs);
114#endif
115}
116
117static inline int v8086_mode(struct pt_regs *regs) 116static inline int v8086_mode(struct pt_regs *regs)
118{ 117{
119#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
138#endif 137#endif
139} 138}
140 139
141#define current_user_stack_pointer() this_cpu_read(old_rsp) 140#define current_user_stack_pointer() current_pt_regs()->sp
142/* ia32 vs. x32 difference */ 141#define compat_user_stack_pointer() current_pt_regs()->sp
143#define compat_user_stack_pointer() \
144 (test_thread_flag(TIF_IA32) \
145 ? current_pt_regs()->sp \
146 : this_cpu_read(old_rsp))
147#endif 142#endif
148 143
149#ifdef CONFIG_X86_32 144#ifdef CONFIG_X86_32
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
248 */ 243 */
249#define arch_ptrace_stop_needed(code, info) \ 244#define arch_ptrace_stop_needed(code, info) \
250({ \ 245({ \
251 set_thread_flag(TIF_NOTIFY_RESUME); \ 246 force_iret(); \
252 false; \ 247 false; \
253}) 248})
254 249
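
The "tricky test" the reworded user_mode() comment refers to can be checked in isolation: OR-ing the CS RPL bits with the EFLAGS VM bit makes any result >= 3 mean "user or vm86". A standalone sketch (the constants mirror SEGMENT_RPL_MASK, USER_RPL and X86_EFLAGS_VM; the selector and flag values fed in are illustrative):

#include <stdio.h>

#define SEGMENT_RPL_MASK        0x3UL
#define USER_RPL                0x3UL
#define X86_VM_MASK             (1UL << 17)     /* X86_EFLAGS_VM */

static int is_user(unsigned long cs, unsigned long flags)
{
        return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
}

int main(void)
{
        printf("kernel CS, RPL 0     : %d\n", is_user(0x60, 0x200));
        printf("user CS,   RPL 3     : %d\n", is_user(0x73, 0x200));
        printf("vm86,      RPL 0 + VM: %d\n", is_user(0x00, 0x200 | X86_VM_MASK));
        return 0;
}

One comparison covers both conditions because the VM bit, once OR-ed in, dominates any RPL value.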
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..25b1cc07d496 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
95 95
96struct pvclock_vsyscall_time_info { 96struct pvclock_vsyscall_time_info {
97 struct pvclock_vcpu_time_info pvti; 97 struct pvclock_vcpu_time_info pvti;
98 u32 migrate_count;
98} __attribute__((__aligned__(SMP_CACHE_BYTES))); 99} __attribute__((__aligned__(SMP_CACHE_BYTES)));
99 100
100#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 101#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
1#ifndef _ASM_X86_SECCOMP_H
2#define _ASM_X86_SECCOMP_H
3
4#include <asm/unistd.h>
5
1#ifdef CONFIG_X86_32 6#ifdef CONFIG_X86_32
2# include <asm/seccomp_32.h> 7#define __NR_seccomp_sigreturn __NR_sigreturn
3#else
4# include <asm/seccomp_64.h>
5#endif 8#endif
9
10#ifdef CONFIG_COMPAT
11#include <asm/ia32_unistd.h>
12#define __NR_seccomp_read_32 __NR_ia32_read
13#define __NR_seccomp_write_32 __NR_ia32_write
14#define __NR_seccomp_exit_32 __NR_ia32_exit
15#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
16#endif
17
18#include <asm-generic/seccomp.h>
19
20#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
1#ifndef _ASM_X86_SECCOMP_32_H
2#define _ASM_X86_SECCOMP_32_H
3
4#include <linux/unistd.h>
5
6#define __NR_seccomp_read __NR_read
7#define __NR_seccomp_write __NR_write
8#define __NR_seccomp_exit __NR_exit
9#define __NR_seccomp_sigreturn __NR_sigreturn
10
11#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef _ASM_X86_SECCOMP_64_H
2#define _ASM_X86_SECCOMP_64_H
3
4#include <linux/unistd.h>
5#include <asm/ia32_unistd.h>
6
7#define __NR_seccomp_read __NR_read
8#define __NR_seccomp_write __NR_write
9#define __NR_seccomp_exit __NR_exit
10#define __NR_seccomp_sigreturn __NR_rt_sigreturn
11
12#define __NR_seccomp_read_32 __NR_ia32_read
13#define __NR_seccomp_write_32 __NR_ia32_write
14#define __NR_seccomp_exit_32 __NR_ia32_exit
15#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
16
17#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a58571f..5a9856eb12ba 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5 5
6/* Constructor for a conventional segment GDT (or LDT) entry */ 6/*
7/* This is a macro so it can be used in initializers */ 7 * Constructor for a conventional segment GDT (or LDT) entry.
8 * This is a macro so it can be used in initializers.
9 */
8#define GDT_ENTRY(flags, base, limit) \ 10#define GDT_ENTRY(flags, base, limit) \
9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ 11 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ 12 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
@@ -12,198 +14,228 @@
12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \ 14 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
13 (((limit) & _AC(0x0000ffff,ULL)))) 15 (((limit) & _AC(0x0000ffff,ULL))))
14 16
15/* Simple and small GDT entries for booting only */ 17/* Simple and small GDT entries for booting only: */
16 18
17#define GDT_ENTRY_BOOT_CS 2 19#define GDT_ENTRY_BOOT_CS 2
18#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) 20#define GDT_ENTRY_BOOT_DS 3
21#define GDT_ENTRY_BOOT_TSS 4
22#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
23#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
25
26/*
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_RPL_MASK 0x3
19 31
20#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) 32/* User mode is privilege level 3: */
21#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) 33#define USER_RPL 0x3
22 34
23#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) 35/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) 36#define SEGMENT_TI_MASK 0x4
37/* LDT segment has TI set ... */
38#define SEGMENT_LDT 0x4
39/* ... GDT has it cleared */
40#define SEGMENT_GDT 0x0
25 41
26#define SEGMENT_RPL_MASK 0x3 /* 42#define GDT_ENTRY_INVALID_SEG 0
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */
31#define USER_RPL 0x3 /* User mode is privilege level 3 */
32#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */
33#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */
34 43
35#ifdef CONFIG_X86_32 44#ifdef CONFIG_X86_32
36/* 45/*
37 * The layout of the per-CPU GDT under Linux: 46 * The layout of the per-CPU GDT under Linux:
38 * 47 *
39 * 0 - null 48 * 0 - null <=== cacheline #1
40 * 1 - reserved 49 * 1 - reserved
41 * 2 - reserved 50 * 2 - reserved
42 * 3 - reserved 51 * 3 - reserved
43 * 52 *
44 * 4 - unused <==== new cacheline 53 * 4 - unused <=== cacheline #2
45 * 5 - unused 54 * 5 - unused
46 * 55 *
47 * ------- start of TLS (Thread-Local Storage) segments: 56 * ------- start of TLS (Thread-Local Storage) segments:
48 * 57 *
49 * 6 - TLS segment #1 [ glibc's TLS segment ] 58 * 6 - TLS segment #1 [ glibc's TLS segment ]
50 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] 59 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
51 * 8 - TLS segment #3 60 * 8 - TLS segment #3 <=== cacheline #3
52 * 9 - reserved 61 * 9 - reserved
53 * 10 - reserved 62 * 10 - reserved
54 * 11 - reserved 63 * 11 - reserved
55 * 64 *
56 * ------- start of kernel segments: 65 * ------- start of kernel segments:
57 * 66 *
58 * 12 - kernel code segment <==== new cacheline 67 * 12 - kernel code segment <=== cacheline #4
59 * 13 - kernel data segment 68 * 13 - kernel data segment
60 * 14 - default user CS 69 * 14 - default user CS
61 * 15 - default user DS 70 * 15 - default user DS
62 * 16 - TSS 71 * 16 - TSS <=== cacheline #5
63 * 17 - LDT 72 * 17 - LDT
64 * 18 - PNPBIOS support (16->32 gate) 73 * 18 - PNPBIOS support (16->32 gate)
65 * 19 - PNPBIOS support 74 * 19 - PNPBIOS support
66 * 20 - PNPBIOS support 75 * 20 - PNPBIOS support <=== cacheline #6
67 * 21 - PNPBIOS support 76 * 21 - PNPBIOS support
68 * 22 - PNPBIOS support 77 * 22 - PNPBIOS support
69 * 23 - APM BIOS support 78 * 23 - APM BIOS support
70 * 24 - APM BIOS support 79 * 24 - APM BIOS support <=== cacheline #7
71 * 25 - APM BIOS support 80 * 25 - APM BIOS support
72 * 81 *
73 * 26 - ESPFIX small SS 82 * 26 - ESPFIX small SS
74 * 27 - per-cpu [ offset to per-cpu data area ] 83 * 27 - per-cpu [ offset to per-cpu data area ]
75 * 28 - stack_canary-20 [ for stack protector ] 84 * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
76 * 29 - unused 85 * 29 - unused
77 * 30 - unused 86 * 30 - unused
78 * 31 - TSS for double fault handler 87 * 31 - TSS for double fault handler
79 */ 88 */
80#define GDT_ENTRY_TLS_MIN 6 89#define GDT_ENTRY_TLS_MIN 6
81#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) 90#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
82 91
92#define GDT_ENTRY_KERNEL_CS 12
93#define GDT_ENTRY_KERNEL_DS 13
83#define GDT_ENTRY_DEFAULT_USER_CS 14 94#define GDT_ENTRY_DEFAULT_USER_CS 14
84
85#define GDT_ENTRY_DEFAULT_USER_DS 15 95#define GDT_ENTRY_DEFAULT_USER_DS 15
96#define GDT_ENTRY_TSS 16
97#define GDT_ENTRY_LDT 17
98#define GDT_ENTRY_PNPBIOS_CS32 18
99#define GDT_ENTRY_PNPBIOS_CS16 19
100#define GDT_ENTRY_PNPBIOS_DS 20
101#define GDT_ENTRY_PNPBIOS_TS1 21
102#define GDT_ENTRY_PNPBIOS_TS2 22
103#define GDT_ENTRY_APMBIOS_BASE 23
104
105#define GDT_ENTRY_ESPFIX_SS 26
106#define GDT_ENTRY_PERCPU 27
107#define GDT_ENTRY_STACK_CANARY 28
108
109#define GDT_ENTRY_DOUBLEFAULT_TSS 31
86 110
87#define GDT_ENTRY_KERNEL_BASE (12) 111/*
112 * Number of entries in the GDT table:
113 */
114#define GDT_ENTRIES 32
88 115
89#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) 116/*
117 * Segment selector values corresponding to the above entries:
118 */
90 119
91#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) 120#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
121#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
122#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
123#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
124#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
92 125
93#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) 126/* segment for calling fn: */
94#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) 127#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
128/* code segment for BIOS: */
129#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
95 130
96#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) 131/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
97#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) 132#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
98 133
99#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) 134/* data segment for BIOS: */
100#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) 135#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
136/* transfer data segment: */
137#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
138/* another data segment: */
139#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
101 140
102#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
103#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
104#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) 142# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
105#else 143#else
106#define __KERNEL_PERCPU 0 144# define __KERNEL_PERCPU 0
107#endif 145#endif
108 146
109#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
110#ifdef CONFIG_CC_STACKPROTECTOR 147#ifdef CONFIG_CC_STACKPROTECTOR
111#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) 148# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
112#else 149#else
113#define __KERNEL_STACK_CANARY 0 150# define __KERNEL_STACK_CANARY 0
114#endif 151#endif
115 152
116#define GDT_ENTRY_DOUBLEFAULT_TSS 31 153#else /* 64-bit: */
117
118/*
119 * The GDT has 32 entries
120 */
121#define GDT_ENTRIES 32
122 154
123/* The PnP BIOS entries in the GDT */ 155#include <asm/cache.h>
124#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
125#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
126#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
127#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
128#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
129
130/* The PnP BIOS selectors */
131#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
132#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
133#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
134#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
135#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
136 156
157#define GDT_ENTRY_KERNEL32_CS 1
158#define GDT_ENTRY_KERNEL_CS 2
159#define GDT_ENTRY_KERNEL_DS 3
137 160
138/* 161/*
139 * Matching rules for certain types of segments. 162 * We cannot use the same code segment descriptor for user and kernel mode,
163 * not even in long flat mode, because of different DPL.
164 *
165 * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
166 * selectors:
167 *
168 * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
169 * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
170 *
171 * ss = STAR.SYSRET_CS+8 (in either case)
172 *
173 * thus USER_DS should be between 32-bit and 64-bit code selectors:
140 */ 174 */
175#define GDT_ENTRY_DEFAULT_USER32_CS 4
176#define GDT_ENTRY_DEFAULT_USER_DS 5
177#define GDT_ENTRY_DEFAULT_USER_CS 6
141 178
142/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ 179/* Needs two entries */
143#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) 180#define GDT_ENTRY_TSS 8
144 181/* Needs two entries */
182#define GDT_ENTRY_LDT 10
145 183
146#else 184#define GDT_ENTRY_TLS_MIN 12
147#include <asm/cache.h> 185#define GDT_ENTRY_TLS_MAX 14
148
149#define GDT_ENTRY_KERNEL32_CS 1
150#define GDT_ENTRY_KERNEL_CS 2
151#define GDT_ENTRY_KERNEL_DS 3
152 186
153#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) 187/* Abused to load per CPU data from limit */
188#define GDT_ENTRY_PER_CPU 15
154 189
155/* 190/*
156 * we cannot use the same code segment descriptor for user and kernel 191 * Number of entries in the GDT table:
157 * -- not even in the long flat mode, because of different DPL /kkeil
158 * The segment offset needs to contain a RPL. Grr. -AK
159 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
160 */ 192 */
161#define GDT_ENTRY_DEFAULT_USER32_CS 4 193#define GDT_ENTRIES 16
162#define GDT_ENTRY_DEFAULT_USER_DS 5
163#define GDT_ENTRY_DEFAULT_USER_CS 6
164#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
165#define __USER32_DS __USER_DS
166
167#define GDT_ENTRY_TSS 8 /* needs two entries */
168#define GDT_ENTRY_LDT 10 /* needs two entries */
169#define GDT_ENTRY_TLS_MIN 12
170#define GDT_ENTRY_TLS_MAX 14
171
172#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
173#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
174 194
175/* TLS indexes for 64bit - hardcoded in arch_prctl */ 195/*
176#define FS_TLS 0 196 * Segment selector values corresponding to the above entries:
177#define GS_TLS 1 197 *
178 198 * Note, selectors also need to have a correct RPL,
179#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) 199 * expressed with the +3 value for user-space selectors:
180#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) 200 */
181 201#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
182#define GDT_ENTRIES 16 202#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
203#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
204#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
205#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
206#define __USER32_DS __USER_DS
207#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
208#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
209
210/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
211#define FS_TLS 0
212#define GS_TLS 1
213
214#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
215#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
183 216
184#endif 217#endif
185 218
186#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
187#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
188#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
189#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
190#ifndef CONFIG_PARAVIRT 219#ifndef CONFIG_PARAVIRT
191#define get_kernel_rpl() 0 220# define get_kernel_rpl() 0
192#endif 221#endif
193 222
194#define IDT_ENTRIES 256 223#define IDT_ENTRIES 256
195#define NUM_EXCEPTION_VECTORS 32 224#define NUM_EXCEPTION_VECTORS 32
196/* Bitmask of exception vectors which push an error code on the stack */ 225
197#define EXCEPTION_ERRCODE_MASK 0x00027d00 226/* Bitmask of exception vectors which push an error code on the stack: */
198#define GDT_SIZE (GDT_ENTRIES * 8) 227#define EXCEPTION_ERRCODE_MASK 0x00027d00
199#define GDT_ENTRY_TLS_ENTRIES 3 228
200#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 229#define GDT_SIZE (GDT_ENTRIES*8)
230#define GDT_ENTRY_TLS_ENTRIES 3
231#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
201 232
202#ifdef __KERNEL__ 233#ifdef __KERNEL__
203#ifndef __ASSEMBLY__ 234#ifndef __ASSEMBLY__
235
204extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; 236extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
205#ifdef CONFIG_TRACING 237#ifdef CONFIG_TRACING
206#define trace_early_idt_handlers early_idt_handlers 238# define trace_early_idt_handlers early_idt_handlers
207#endif 239#endif
208 240
209/* 241/*
@@ -228,37 +260,30 @@ do { \
228} while (0) 260} while (0)
229 261
230/* 262/*
231 * Save a segment register away 263 * Save a segment register away:
232 */ 264 */
233#define savesegment(seg, value) \ 265#define savesegment(seg, value) \
234 asm("mov %%" #seg ",%0":"=r" (value) : : "memory") 266 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
235 267
236/* 268/*
237 * x86_32 user gs accessors. 269 * x86-32 user GS accessors:
238 */ 270 */
239#ifdef CONFIG_X86_32 271#ifdef CONFIG_X86_32
240#ifdef CONFIG_X86_32_LAZY_GS 272# ifdef CONFIG_X86_32_LAZY_GS
241#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) 273# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
242#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) 274# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
243#define task_user_gs(tsk) ((tsk)->thread.gs) 275# define task_user_gs(tsk) ((tsk)->thread.gs)
244#define lazy_save_gs(v) savesegment(gs, (v)) 276# define lazy_save_gs(v) savesegment(gs, (v))
245#define lazy_load_gs(v) loadsegment(gs, (v)) 277# define lazy_load_gs(v) loadsegment(gs, (v))
246#else /* X86_32_LAZY_GS */ 278# else /* X86_32_LAZY_GS */
247#define get_user_gs(regs) (u16)((regs)->gs) 279# define get_user_gs(regs) (u16)((regs)->gs)
248#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) 280# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
249#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) 281# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
250#define lazy_save_gs(v) do { } while (0) 282# define lazy_save_gs(v) do { } while (0)
251#define lazy_load_gs(v) do { } while (0) 283# define lazy_load_gs(v) do { } while (0)
252#endif /* X86_32_LAZY_GS */ 284# endif /* X86_32_LAZY_GS */
253#endif /* X86_32 */ 285#endif /* X86_32 */
254 286
255static inline unsigned long get_limit(unsigned long segment)
256{
257 unsigned long __limit;
258 asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
259 return __limit + 1;
260}
261
262#endif /* !__ASSEMBLY__ */ 287#endif /* !__ASSEMBLY__ */
263#endif /* __KERNEL__ */ 288#endif /* __KERNEL__ */
264 289
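The new 64-bit layout comment relies on a selector being (GDT index * 8) | RPL and on SYSRET deriving both user selectors from a single MSR_STAR field. A standalone sketch (not part of the patch; SEL() is an illustrative macro) confirms that the USER32_CS / USER_DS / USER_CS ordering above hands SYSRET exactly the consecutive selectors it hardcodes:

/* Selector arithmetic implied by the 64-bit GDT layout above. */
#include <assert.h>

#define SEL(index, rpl)  (((index) << 3) | (rpl))

int main(void)
{
        unsigned int user32_cs = SEL(4, 3);  /* __USER32_CS = 0x23 */
        unsigned int user_ds   = SEL(5, 3);  /* __USER_DS   = 0x2b */
        unsigned int user_cs   = SEL(6, 3);  /* __USER_CS   = 0x33 */

        /* SYSRET: ss = STAR.SYSRET_CS + 8 in both cases,
         *         cs = STAR.SYSRET_CS + 16 when returning to 64-bit. */
        unsigned int sysret_cs = user32_cs;  /* Linux puts __USER32_CS there */

        assert(user_ds == sysret_cs + 8);
        assert(user_cs == sysret_cs + 16);
        return 0;
}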
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ff4e7b236e21..f69e06b283fb 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
66 */ 66 */
67extern struct boot_params boot_params; 67extern struct boot_params boot_params;
68 68
69static inline bool kaslr_enabled(void)
70{
71 return !!(boot_params.hdr.loadflags & KASLR_FLAG);
72}
73
69/* 74/*
70 * Do NOT EVER look at the BIOS memory size location. 75 * Do NOT EVER look at the BIOS memory size location.
71 * It does not work on many machines. 76 * It does not work on many machines.
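kaslr_enabled() gives callers a single place to ask whether the boot loader actually applied KASLR (signalled by the new KASLR_FLAG bit in boot_params.hdr.loadflags, see the bootparam.h hunk further down). A minimal, hypothetical caller (the function and the kaslr_offset variable are illustrative, not taken from this diff):

#include <linux/printk.h>
#include <asm/setup.h>                  /* kaslr_enabled(), boot_params */

static unsigned long kaslr_offset;      /* assumed to be filled in elsewhere */

static void report_kaslr(void)
{
        if (!kaslr_enabled()) {
                pr_info("KASLR: disabled by the boot loader\n");
                return;
        }
        pr_info("KASLR: kernel relocated by 0x%lx\n", kaslr_offset);
}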
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
57 unsigned long ip; 57 unsigned long ip;
58 unsigned long flags; 58 unsigned long flags;
59 unsigned short cs; 59 unsigned short cs;
60 unsigned short gs; 60 unsigned short __pad2; /* Was called gs, but was always zero. */
61 unsigned short fs; 61 unsigned short __pad1; /* Was called fs, but was always zero. */
62 unsigned short __pad0; 62 unsigned short ss;
63 unsigned long err; 63 unsigned long err;
64 unsigned long trapno; 64 unsigned long trapno;
65 unsigned long oldmask; 65 unsigned long oldmask;
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 7a958164088c..89db46752a8f 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,9 +13,7 @@
13 X86_EFLAGS_CF | X86_EFLAGS_RF) 13 X86_EFLAGS_CF | X86_EFLAGS_RF)
14 14
15void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 15void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
16 16int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
17int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
18 unsigned long *pax);
19int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 17int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
20 struct pt_regs *regs, unsigned long mask); 18 struct pt_regs *regs, unsigned long mask);
21 19
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
27 27
28#ifdef CONFIG_X86_SMAP 28#ifdef CONFIG_X86_SMAP
29 29
30#define ASM_CLAC \ 30#define ASM_CLAC \
31 661: ASM_NOP3 ; \ 31 ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
32 .pushsection .altinstr_replacement, "ax" ; \ 32
33 662: __ASM_CLAC ; \ 33#define ASM_STAC \
34 .popsection ; \ 34 ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
35 .pushsection .altinstructions, "a" ; \
36 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
37 .popsection
38
39#define ASM_STAC \
40 661: ASM_NOP3 ; \
41 .pushsection .altinstr_replacement, "ax" ; \
42 662: __ASM_STAC ; \
43 .popsection ; \
44 .pushsection .altinstructions, "a" ; \
45 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
46 .popsection
47 35
48#else /* CONFIG_X86_SMAP */ 36#else /* CONFIG_X86_SMAP */
49 37
@@ -61,20 +49,20 @@
61static __always_inline void clac(void) 49static __always_inline void clac(void)
62{ 50{
63 /* Note: a barrier is implicit in alternative() */ 51 /* Note: a barrier is implicit in alternative() */
64 alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); 52 alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
65} 53}
66 54
67static __always_inline void stac(void) 55static __always_inline void stac(void)
68{ 56{
69 /* Note: a barrier is implicit in alternative() */ 57 /* Note: a barrier is implicit in alternative() */
70 alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); 58 alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
71} 59}
72 60
73/* These macros can be used in asm() statements */ 61/* These macros can be used in asm() statements */
74#define ASM_CLAC \ 62#define ASM_CLAC \
75 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) 63 ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
76#define ASM_STAC \ 64#define ASM_STAC \
77 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) 65 ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
78 66
79#else /* CONFIG_X86_SMAP */ 67#else /* CONFIG_X86_SMAP */
80 68
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..17a8dced12da 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
150} 150}
151 151
152void cpu_disable_common(void); 152void cpu_disable_common(void);
153void cpu_die_common(unsigned int cpu);
154void native_smp_prepare_boot_cpu(void); 153void native_smp_prepare_boot_cpu(void);
155void native_smp_prepare_cpus(unsigned int max_cpus); 154void native_smp_prepare_cpus(unsigned int max_cpus);
156void native_smp_cpus_done(unsigned int max_cpus); 155void native_smp_cpus_done(unsigned int max_cpus);
156void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
158int native_cpu_disable(void); 158int native_cpu_disable(void);
159int common_cpu_die(unsigned int cpu);
159void native_cpu_die(unsigned int cpu); 160void native_cpu_die(unsigned int cpu);
160void native_play_dead(void); 161void native_play_dead(void);
161void play_dead_common(void); 162void play_dead_common(void);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00fafb00..aeb4666e0c0a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
4 4
5#ifdef __KERNEL__ 5#ifdef __KERNEL__
6 6
7#include <asm/nops.h>
8
7static inline void native_clts(void) 9static inline void native_clts(void)
8{ 10{
9 asm volatile("clts"); 11 asm volatile("clts");
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p)
199 "+m" (*(volatile char __force *)__p)); 201 "+m" (*(volatile char __force *)__p));
200} 202}
201 203
204static inline void clwb(volatile void *__p)
205{
206 volatile struct { char x[64]; } *p = __p;
207
208 asm volatile(ALTERNATIVE_2(
209 ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
210 ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
211 X86_FEATURE_CLFLUSHOPT,
212 ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
213 X86_FEATURE_CLWB)
214 : [p] "+m" (*p)
215 : [pax] "a" (p));
216}
217
218static inline void pcommit_sfence(void)
219{
220 alternative(ASM_NOP7,
221 ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
222 "sfence",
223 X86_FEATURE_PCOMMIT);
224}
225
202#define nop() asm volatile ("nop") 226#define nop() asm volatile ("nop")
203 227
204 228
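clwb() and pcommit_sfence() are building blocks for making stores to persistent-memory ranges durable; the ALTERNATIVE_2 in clwb() quietly degrades to CLFLUSHOPT or plain CLFLUSH on CPUs without CLWB. A hedged sketch of how a caller might walk a buffer cache line by cache line (the function name and the 64-byte line size are assumptions, and the exact fencing required between the flush loop and PCOMMIT is glossed over here):

#include <linux/types.h>
#include <asm/special_insns.h>

static void flush_buffer_for_persistence(void *addr, size_t len)
{
        unsigned long p   = (unsigned long)addr & ~63UL;   /* align down */
        unsigned long end = (unsigned long)addr + len;

        for (; p < end; p += 64)
                clwb((void *)p);        /* CLWB, or CLFLUSHOPT/CLFLUSH fallback */

        pcommit_sfence();               /* PCOMMIT + SFENCE, as patched in above */
}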
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..b4bdec3e9523 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,19 +13,44 @@
13#include <asm/types.h> 13#include <asm/types.h>
14 14
15/* 15/*
16 * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
17 * reserve at the top of the kernel stack. We do it because of a nasty
18 * 32-bit corner case. On x86_32, the hardware stack frame is
19 * variable-length. Except for vm86 mode, struct pt_regs assumes a
20 * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
21 * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
22 * does in at least one case:
23 *
24 * If we take an NMI early enough in SYSENTER, then we can end up with
25 * pt_regs that extends above sp0. On the way out, in the espfix code,
26 * we can read the saved SS value, but that value will be above sp0.
27 * Without this offset, that can result in a page fault. (We are
28 * careful that, in this case, the value we read doesn't matter.)
29 *
30 * In vm86 mode, the hardware frame is much longer still, but we neither
31 * access the extra members from NMI context, nor do we write such a
32 * frame at sp0 at all.
33 *
34 * x86_64 has a fixed-length stack frame.
35 */
36#ifdef CONFIG_X86_32
37# define TOP_OF_KERNEL_STACK_PADDING 8
38#else
39# define TOP_OF_KERNEL_STACK_PADDING 0
40#endif
41
42/*
16 * low level task data that entry.S needs immediate access to 43 * low level task data that entry.S needs immediate access to
17 * - this struct should fit entirely inside of one cache line 44 * - this struct should fit entirely inside of one cache line
18 * - this struct shares the supervisor stack pages 45 * - this struct shares the supervisor stack pages
19 */ 46 */
20#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
21struct task_struct; 48struct task_struct;
22struct exec_domain;
23#include <asm/processor.h> 49#include <asm/processor.h>
24#include <linux/atomic.h> 50#include <linux/atomic.h>
25 51
26struct thread_info { 52struct thread_info {
27 struct task_struct *task; /* main task structure */ 53 struct task_struct *task; /* main task structure */
28 struct exec_domain *exec_domain; /* execution domain */
29 __u32 flags; /* low level flags */ 54 __u32 flags; /* low level flags */
30 __u32 status; /* thread synchronous flags */ 55 __u32 status; /* thread synchronous flags */
31 __u32 cpu; /* current CPU */ 56 __u32 cpu; /* current CPU */
@@ -39,7 +64,6 @@ struct thread_info {
39#define INIT_THREAD_INFO(tsk) \ 64#define INIT_THREAD_INFO(tsk) \
40{ \ 65{ \
41 .task = &tsk, \ 66 .task = &tsk, \
42 .exec_domain = &default_exec_domain, \
43 .flags = 0, \ 67 .flags = 0, \
44 .cpu = 0, \ 68 .cpu = 0, \
45 .saved_preempt_count = INIT_PREEMPT_COUNT, \ 69 .saved_preempt_count = INIT_PREEMPT_COUNT, \
@@ -145,7 +169,6 @@ struct thread_info {
145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) 169#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
146 170
147#define STACK_WARN (THREAD_SIZE/8) 171#define STACK_WARN (THREAD_SIZE/8)
148#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
149 172
150/* 173/*
151 * macros/functions for gaining access to the thread information structure 174 * macros/functions for gaining access to the thread information structure
@@ -158,10 +181,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
158 181
159static inline struct thread_info *current_thread_info(void) 182static inline struct thread_info *current_thread_info(void)
160{ 183{
161 struct thread_info *ti; 184 return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
162 ti = (void *)(this_cpu_read_stable(kernel_stack) +
163 KERNEL_STACK_OFFSET - THREAD_SIZE);
164 return ti;
165} 185}
166 186
167static inline unsigned long current_stack_pointer(void) 187static inline unsigned long current_stack_pointer(void)
@@ -177,16 +197,37 @@ static inline unsigned long current_stack_pointer(void)
177 197
178#else /* !__ASSEMBLY__ */ 198#else /* !__ASSEMBLY__ */
179 199
180/* how to get the thread information struct from ASM */ 200/* Load thread_info address into "reg" */
181#define GET_THREAD_INFO(reg) \ 201#define GET_THREAD_INFO(reg) \
182 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ 202 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
183 _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; 203 _ASM_SUB $(THREAD_SIZE),reg ;
184 204
185/* 205/*
186 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in 206 * ASM operand which evaluates to a 'thread_info' address of
187 * a certain register (to be used in assembler memory operands). 207 * the current task, if it is known that "reg" is exactly "off"
208 * bytes below the top of the stack currently.
209 *
210 * ( The kernel stack's size is known at build time, it is usually
211 * 2 or 4 pages, and the bottom of the kernel stack contains
212 * the thread_info structure. So to access the thread_info very
213 * quickly from assembly code we can calculate down from the
214 * top of the kernel stack to the bottom, using constant,
215 * build-time calculations only. )
216 *
217 * For example, to fetch the current thread_info->flags value into %eax
218 * on x86-64 defconfig kernels, in syscall entry code where RSP is
219 * currently at exactly SIZEOF_PTREGS bytes away from the top of the
220 * stack:
221 *
222 * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
223 *
224 * will translate to:
225 *
226 * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
227 *
228 * which is below the current RSP by almost 16K.
188 */ 229 */
189#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) 230#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
190 231
191#endif 232#endif
192 233
@@ -236,6 +277,16 @@ static inline bool is_ia32_task(void)
236#endif 277#endif
237 return false; 278 return false;
238} 279}
280
281/*
282 * Force syscall return via IRET by making it look as if there was
283 * some work pending. IRET is our most capable (but slowest) syscall
284 * return path, which is able to restore modified SS, CS and certain
285 * EFLAGS values that other (fast) syscall return instructions
286 * are not able to restore properly.
287 */
288#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
289
239#endif /* !__ASSEMBLY__ */ 290#endif /* !__ASSEMBLY__ */
240 291
241#ifndef __ASSEMBLY__ 292#ifndef __ASSEMBLY__
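force_iret() names the trick that arch_ptrace_stop_needed() used to open-code earlier in this series of hunks: set TIF_NOTIFY_RESUME so the exit path takes the slow but fully capable IRET return. A hypothetical caller (the function name is invented for illustration) that edits pt_regs would use it the same way:

static void set_user_cs(struct pt_regs *regs, unsigned long new_cs)
{
        regs->cs = new_cs;

        /* SYSRET/SYSEXIT cannot restore a modified CS/SS/EFLAGS,
         * so request the IRET return path instead: */
        force_iret();
}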
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..ab456dc233b5 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -15,6 +15,7 @@
15 15
16/* loadflags */ 16/* loadflags */
17#define LOADED_HIGH (1<<0) 17#define LOADED_HIGH (1<<0)
18#define KASLR_FLAG (1<<1)
18#define QUIET_FLAG (1<<5) 19#define QUIET_FLAG (1<<5)
19#define KEEP_SEGMENTS (1<<6) 20#define KEEP_SEGMENTS (1<<6)
20#define CAN_USE_HEAP (1<<7) 21#define CAN_USE_HEAP (1<<7)
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33f5236..960a8a9dc4ab 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
33#define E820_NVS 4 33#define E820_NVS 4
34#define E820_UNUSABLE 5 34#define E820_UNUSABLE 5
35 35
36/*
37 * This is a non-standardized way to represent ADR or NVDIMM regions that
38 * persist over a reboot. The kernel will ignore their special capabilities
39 * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
40 *
41 * ( Note that older platforms also used 6 for the same type of memory,
42 * but newer versions switched to 12 as 6 was assigned differently. Some
43 * time they will learn... )
44 */
45#define E820_PRAM 12
36 46
37/* 47/*
38 * reserved RAM used by kernel itself 48 * reserved RAM used by kernel itself
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index fe01b0a784e7..c469490db4a8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -77,6 +77,24 @@
77#define MSR_IA32_PERF_CAPABILITIES 0x00000345 77#define MSR_IA32_PERF_CAPABILITIES 0x00000345
78#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 78#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
79 79
80#define MSR_IA32_RTIT_CTL 0x00000570
81#define RTIT_CTL_TRACEEN BIT(0)
82#define RTIT_CTL_OS BIT(2)
83#define RTIT_CTL_USR BIT(3)
84#define RTIT_CTL_CR3EN BIT(7)
85#define RTIT_CTL_TOPA BIT(8)
86#define RTIT_CTL_TSC_EN BIT(10)
87#define RTIT_CTL_DISRETC BIT(11)
88#define RTIT_CTL_BRANCH_EN BIT(13)
89#define MSR_IA32_RTIT_STATUS 0x00000571
90#define RTIT_STATUS_CONTEXTEN BIT(1)
91#define RTIT_STATUS_TRIGGEREN BIT(2)
92#define RTIT_STATUS_ERROR BIT(4)
93#define RTIT_STATUS_STOPPED BIT(5)
94#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
95#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
96#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
97
80#define MSR_MTRRfix64K_00000 0x00000250 98#define MSR_MTRRfix64K_00000 0x00000250
81#define MSR_MTRRfix16K_80000 0x00000258 99#define MSR_MTRRfix16K_80000 0x00000258
82#define MSR_MTRRfix16K_A0000 0x00000259 100#define MSR_MTRRfix16K_A0000 0x00000259
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
25#else /* __i386__ */ 25#else /* __i386__ */
26 26
27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
28/*
29 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
30 * unless syscall needs a complete, fully filled "struct pt_regs".
31 */
28#define R15 0 32#define R15 0
29#define R14 8 33#define R14 8
30#define R13 16 34#define R13 16
31#define R12 24 35#define R12 24
32#define RBP 32 36#define RBP 32
33#define RBX 40 37#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save up to here*/ 38/* These regs are callee-clobbered. Always saved on kernel entry. */
35#define R11 48 39#define R11 48
36#define R10 56 40#define R10 56
37#define R9 64 41#define R9 64
@@ -41,15 +45,17 @@
41#define RDX 96 45#define RDX 96
42#define RSI 104 46#define RSI 104
43#define RDI 112 47#define RDI 112
44#define ORIG_RAX 120 /* = ERROR */ 48/*
45/* end of arguments */ 49 * On syscall entry, this is syscall#. On CPU exception, this is error code.
46/* cpu exception frame or undefined in case of fast syscall. */ 50 * On hw interrupt, it's IRQ number:
51 */
52#define ORIG_RAX 120
53/* Return frame for iretq */
47#define RIP 128 54#define RIP 128
48#define CS 136 55#define CS 136
49#define EFLAGS 144 56#define EFLAGS 144
50#define RSP 152 57#define RSP 152
51#define SS 160 58#define SS 160
52#define ARGOFFSET R11
53#endif /* __ASSEMBLY__ */ 59#endif /* __ASSEMBLY__ */
54 60
55/* top of stack page */ 61/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
41#ifndef __KERNEL__ 41#ifndef __KERNEL__
42 42
43struct pt_regs { 43struct pt_regs {
44/*
45 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
46 * unless syscall needs a complete, fully filled "struct pt_regs".
47 */
44 unsigned long r15; 48 unsigned long r15;
45 unsigned long r14; 49 unsigned long r14;
46 unsigned long r13; 50 unsigned long r13;
47 unsigned long r12; 51 unsigned long r12;
48 unsigned long rbp; 52 unsigned long rbp;
49 unsigned long rbx; 53 unsigned long rbx;
50/* arguments: non interrupts/non tracing syscalls only save up to here*/ 54/* These regs are callee-clobbered. Always saved on kernel entry. */
51 unsigned long r11; 55 unsigned long r11;
52 unsigned long r10; 56 unsigned long r10;
53 unsigned long r9; 57 unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
57 unsigned long rdx; 61 unsigned long rdx;
58 unsigned long rsi; 62 unsigned long rsi;
59 unsigned long rdi; 63 unsigned long rdi;
64/*
65 * On syscall entry, this is syscall#. On CPU exception, this is error code.
66 * On hw interrupt, it's IRQ number:
67 */
60 unsigned long orig_rax; 68 unsigned long orig_rax;
61/* end of arguments */ 69/* Return frame for iretq */
62/* cpu exception frame or undefined */
63 unsigned long rip; 70 unsigned long rip;
64 unsigned long cs; 71 unsigned long cs;
65 unsigned long eflags; 72 unsigned long eflags;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@ struct sigcontext {
177 __u64 rip; 177 __u64 rip;
178 __u64 eflags; /* RFLAGS */ 178 __u64 eflags; /* RFLAGS */
179 __u16 cs; 179 __u16 cs;
180 __u16 gs; 180
181 __u16 fs; 181 /*
182 __u16 __pad0; 182 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
183 * Linux saved and restored fs and gs in these slots. This
184 * was counterproductive, as fsbase and gsbase were never
185 * saved, so arch_prctl was presumably unreliable.
186 *
187 * If these slots are ever needed for any other purpose, there
188 * is some risk that very old 64-bit binaries could get
189 * confused. I doubt that many such binaries still work,
190 * though, since the same patch in 2.5.64 also removed the
191 * 64-bit set_thread_area syscall, so it appears that there is
192 * no TLS API that works in both pre- and post-2.5.64 kernels.
193 */
194 __u16 __pad2; /* Was gs. */
195 __u16 __pad1; /* Was fs. */
196
197 __u16 ss;
183 __u64 err; 198 __u64 err;
184 __u64 trapno; 199 __u64 trapno;
185 __u64 oldmask; 200 __u64 oldmask;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index c5f1a1deb91a..1fe92181ee9e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
67#define EXIT_REASON_EPT_VIOLATION 48 67#define EXIT_REASON_EPT_VIOLATION 48
68#define EXIT_REASON_EPT_MISCONFIG 49 68#define EXIT_REASON_EPT_MISCONFIG 49
69#define EXIT_REASON_INVEPT 50 69#define EXIT_REASON_INVEPT 50
70#define EXIT_REASON_RDTSCP 51
70#define EXIT_REASON_PREEMPTION_TIMER 52 71#define EXIT_REASON_PREEMPTION_TIMER 52
71#define EXIT_REASON_INVVPID 53 72#define EXIT_REASON_INVVPID 53
72#define EXIT_REASON_WBINVD 54 73#define EXIT_REASON_WBINVD 54
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..9bcd0b56ca17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
33obj-$(CONFIG_X86_64) += mcount_64.o 33obj-$(CONFIG_X86_64) += mcount_64.o
34obj-y += syscall_$(BITS).o vsyscall_gtod.o 34obj-y += syscall_$(BITS).o vsyscall_gtod.o
35obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
35obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o 36obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
36obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o 37obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
37obj-$(CONFIG_SYSFS) += ksysfs.o 38obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -94,6 +95,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
94obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 95obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
95obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o 96obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
96obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 97obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
98obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
97 99
98obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 100obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
99 101
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 52__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 53#endif
54 54
55#define DPRINTK(fmt, ...) \ 55#define DPRINTK(fmt, args...) \
56do { \ 56do { \
57 if (debug_alternative) \ 57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ 58 printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
59} while (0)
60
61#define DUMP_BYTES(buf, len, fmt, args...) \
62do { \
63 if (unlikely(debug_alternative)) { \
64 int j; \
65 \
66 if (!(len)) \
67 break; \
68 \
69 printk(KERN_DEBUG fmt, ##args); \
70 for (j = 0; j < (len) - 1; j++) \
71 printk(KERN_CONT "%02hhx ", buf[j]); \
72 printk(KERN_CONT "%02hhx\n", buf[j]); \
73 } \
59} while (0) 74} while (0)
60 75
61/* 76/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
243extern s32 __smp_locks[], __smp_locks_end[]; 258extern s32 __smp_locks[], __smp_locks_end[];
244void *text_poke_early(void *addr, const void *opcode, size_t len); 259void *text_poke_early(void *addr, const void *opcode, size_t len);
245 260
246/* Replace instructions with better alternatives for this CPU type. 261/*
247 This runs before SMP is initialized to avoid SMP problems with 262 * Are we looking at a near JMP with a 1 or 4-byte displacement.
248 self modifying code. This implies that asymmetric systems where 263 */
249 APs have less capabilities than the boot processor are not handled. 264static inline bool is_jmp(const u8 opcode)
250 Tough. Make sure you disable such features by hand. */ 265{
266 return opcode == 0xeb || opcode == 0xe9;
267}
268
269static void __init_or_module
270recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
271{
272 u8 *next_rip, *tgt_rip;
273 s32 n_dspl, o_dspl;
274 int repl_len;
275
276 if (a->replacementlen != 5)
277 return;
278
279 o_dspl = *(s32 *)(insnbuf + 1);
280
281 /* next_rip of the replacement JMP */
282 next_rip = repl_insn + a->replacementlen;
283 /* target rip of the replacement JMP */
284 tgt_rip = next_rip + o_dspl;
285 n_dspl = tgt_rip - orig_insn;
286
287 DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
288
289 if (tgt_rip - orig_insn >= 0) {
290 if (n_dspl - 2 <= 127)
291 goto two_byte_jmp;
292 else
293 goto five_byte_jmp;
294 /* negative offset */
295 } else {
296 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
297 goto two_byte_jmp;
298 else
299 goto five_byte_jmp;
300 }
301
302two_byte_jmp:
303 n_dspl -= 2;
304
305 insnbuf[0] = 0xeb;
306 insnbuf[1] = (s8)n_dspl;
307 add_nops(insnbuf + 2, 3);
308
309 repl_len = 2;
310 goto done;
311
312five_byte_jmp:
313 n_dspl -= 5;
314
315 insnbuf[0] = 0xe9;
316 *(s32 *)&insnbuf[1] = n_dspl;
251 317
318 repl_len = 5;
319
320done:
321
322 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
323 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
324}
325
326static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
327{
328 if (instr[0] != 0x90)
329 return;
330
331 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
332
333 DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
334 instr, a->instrlen - a->padlen, a->padlen);
335}
336
337/*
338 * Replace instructions with better alternatives for this CPU type. This runs
339 * before SMP is initialized to avoid SMP problems with self modifying code.
340 * This implies that asymmetric systems where APs have less capabilities than
341 * the boot processor are not handled. Tough. Make sure you disable such
342 * features by hand.
343 */
252void __init_or_module apply_alternatives(struct alt_instr *start, 344void __init_or_module apply_alternatives(struct alt_instr *start,
253 struct alt_instr *end) 345 struct alt_instr *end)
254{ 346{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
256 u8 *instr, *replacement; 348 u8 *instr, *replacement;
257 u8 insnbuf[MAX_PATCH_LEN]; 349 u8 insnbuf[MAX_PATCH_LEN];
258 350
259 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 351 DPRINTK("alt table %p -> %p", start, end);
260 /* 352 /*
261 * The scan order should be from start to end. A later scanned 353 * The scan order should be from start to end. A later scanned
262 * alternative code can overwrite a previous scanned alternative code. 354 * alternative code can overwrite previously scanned alternative code.
263 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 355 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
264 * patch code. 356 * patch code.
265 * 357 *
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
267 * order. 359 * order.
268 */ 360 */
269 for (a = start; a < end; a++) { 361 for (a = start; a < end; a++) {
362 int insnbuf_sz = 0;
363
270 instr = (u8 *)&a->instr_offset + a->instr_offset; 364 instr = (u8 *)&a->instr_offset + a->instr_offset;
271 replacement = (u8 *)&a->repl_offset + a->repl_offset; 365 replacement = (u8 *)&a->repl_offset + a->repl_offset;
272 BUG_ON(a->replacementlen > a->instrlen);
273 BUG_ON(a->instrlen > sizeof(insnbuf)); 366 BUG_ON(a->instrlen > sizeof(insnbuf));
274 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 367 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
275 if (!boot_cpu_has(a->cpuid)) 368 if (!boot_cpu_has(a->cpuid)) {
369 if (a->padlen > 1)
370 optimize_nops(a, instr);
371
276 continue; 372 continue;
373 }
374
375 DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
376 a->cpuid >> 5,
377 a->cpuid & 0x1f,
378 instr, a->instrlen,
379 replacement, a->replacementlen, a->padlen);
380
381 DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
382 DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
277 383
278 memcpy(insnbuf, replacement, a->replacementlen); 384 memcpy(insnbuf, replacement, a->replacementlen);
385 insnbuf_sz = a->replacementlen;
279 386
280 /* 0xe8 is a relative jump; fix the offset. */ 387 /* 0xe8 is a relative jump; fix the offset. */
281 if (*insnbuf == 0xe8 && a->replacementlen == 5) 388 if (*insnbuf == 0xe8 && a->replacementlen == 5) {
282 *(s32 *)(insnbuf + 1) += replacement - instr; 389 *(s32 *)(insnbuf + 1) += replacement - instr;
390 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
391 *(s32 *)(insnbuf + 1),
392 (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
393 }
394
395 if (a->replacementlen && is_jmp(replacement[0]))
396 recompute_jump(a, instr, replacement, insnbuf);
283 397
284 add_nops(insnbuf + a->replacementlen, 398 if (a->instrlen > a->replacementlen) {
285 a->instrlen - a->replacementlen); 399 add_nops(insnbuf + a->replacementlen,
400 a->instrlen - a->replacementlen);
401 insnbuf_sz += a->instrlen - a->replacementlen;
402 }
403 DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
286 404
287 text_poke_early(instr, insnbuf, a->instrlen); 405 text_poke_early(instr, insnbuf, insnbuf_sz);
288 } 406 }
289} 407}
290 408
291#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
292
293static void alternatives_smp_lock(const s32 *start, const s32 *end, 410static void alternatives_smp_lock(const s32 *start, const s32 *end,
294 u8 *text, u8 *text_end) 411 u8 *text, u8 *text_end)
295{ 412{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
371 smp->locks_end = locks_end; 488 smp->locks_end = locks_end;
372 smp->text = text; 489 smp->text = text;
373 smp->text_end = text_end; 490 smp->text_end = text_end;
374 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", 491 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
375 __func__, smp->locks, smp->locks_end, 492 smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 493 smp->text, smp->text_end, smp->name);
377 494
378 list_add_tail(&smp->next, &smp_alt_modules); 495 list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
440 557
441 return 0; 558 return 0;
442} 559}
443#endif 560#endif /* CONFIG_SMP */
444 561
445#ifdef CONFIG_PARAVIRT 562#ifdef CONFIG_PARAVIRT
446void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 563void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
601 if (likely(!bp_patching_in_progress)) 718 if (likely(!bp_patching_in_progress))
602 return 0; 719 return 0;
603 720
604 if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) 721 if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
605 return 0; 722 return 0;
606 723
607 /* set up the specified breakpoint handler */ 724 /* set up the specified breakpoint handler */
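The displacement handling added in recompute_jump() above is plain pointer arithmetic: the rel32 inside the replacement JMP is only valid relative to the replacement's own address, so after the bytes are copied to the original site the displacement must be recomputed, and the JMP is shrunk to the 2-byte form when the new offset fits in a signed byte. A standalone sketch with made-up addresses (and a simplified range check) to make the numbers concrete:

#include <stdio.h>

int main(void)
{
        unsigned long orig_insn = 0x1000;       /* patch site               */
        unsigned long repl_insn = 0x2000;       /* replacement 5-byte JMP   */
        long o_dspl = 0x3b;                     /* rel32 inside replacement */

        unsigned long next_rip = repl_insn + 5;         /* 0x2005 */
        unsigned long tgt_rip  = next_rip + o_dspl;     /* 0x2040 */
        long n_dspl = tgt_rip - orig_insn;              /* 0x1040 */

        if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127)
                printf("2-byte JMP: eb %02lx plus 3 NOPs\n",
                       (unsigned long)(n_dspl - 2) & 0xff);
        else
                printf("5-byte JMP: e9, rel32 = %#lx\n",
                       (unsigned long)(n_dspl - 5));    /* 0x103b */
        return 0;
}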
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639ae1b9b..dcb52850a28f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
1084 local_irq_restore(flags); 1084 local_irq_restore(flags);
1085} 1085}
1086 1086
1087/*
1088 * This is to verify that we're looking at a real local APIC.
1089 * Check these against your board if the CPUs aren't getting
1090 * started for no apparent reason.
1091 */
1092int __init verify_local_APIC(void)
1093{
1094 unsigned int reg0, reg1;
1095
1096 /*
1097 * The version register is read-only in a real APIC.
1098 */
1099 reg0 = apic_read(APIC_LVR);
1100 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
1101 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
1102 reg1 = apic_read(APIC_LVR);
1103 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
1104
1105 /*
1106 * The two version reads above should print the same
1107 * numbers. If the second one is different, then we
1108 * poke at a non-APIC.
1109 */
1110 if (reg1 != reg0)
1111 return 0;
1112
1113 /*
1114 * Check if the version looks reasonably.
1115 */
1116 reg1 = GET_APIC_VERSION(reg0);
1117 if (reg1 == 0x00 || reg1 == 0xff)
1118 return 0;
1119 reg1 = lapic_get_maxlvt();
1120 if (reg1 < 0x02 || reg1 == 0xff)
1121 return 0;
1122
1123 /*
1124 * The ID register is read/write in a real APIC.
1125 */
1126 reg0 = apic_read(APIC_ID);
1127 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1128 apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
1129 reg1 = apic_read(APIC_ID);
1130 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1131 apic_write(APIC_ID, reg0);
1132 if (reg1 != (reg0 ^ apic->apic_id_mask))
1133 return 0;
1134
1135 /*
1136 * The next two are just to see if we have sane values.
1137 * They're only really relevant if we're in Virtual Wire
1138 * compatibility mode, but most boxes are anymore.
1139 */
1140 reg0 = apic_read(APIC_LVT0);
1141 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
1142 reg1 = apic_read(APIC_LVT1);
1143 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
1144
1145 return 1;
1146}
1147
1148/** 1087/**
1149 * sync_Arb_IDs - synchronize APIC bus arbitration IDs 1088 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
1150 */ 1089 */
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
2283 disable_ioapic_support(); 2222 disable_ioapic_support();
2284 2223
2285 default_setup_apic_routing(); 2224 default_setup_apic_routing();
2286 verify_local_APIC();
2287 apic_bsp_setup(true); 2225 apic_bsp_setup(true);
2288 return 0; 2226 return 0;
2289} 2227}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21681c8..d9d0bd2faaf4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
135 135
136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
137 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); 138 cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) { 139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) 140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue; 141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); 142 cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); 143 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 } 144 }
145} 145}
146 146
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
195 195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); 196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197 197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); 198 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier); 199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1; 200 return 1;
201} 201}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8e9dcfd630e4..c8d92950bc04 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void)
144 144
145static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 145static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
146{ 146{
147 int pnodeid, is_uv1, is_uv2, is_uv3; 147 int pnodeid;
148 148 int uv_apic;
149 is_uv1 = !strcmp(oem_id, "SGI"); 149
150 is_uv2 = !strcmp(oem_id, "SGI2"); 150 if (strncmp(oem_id, "SGI", 3) != 0)
151 is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ 151 return 0;
152 if (is_uv1 || is_uv2 || is_uv3) { 152
153 uv_hub_info->hub_revision = 153 /*
154 (is_uv1 ? UV1_HUB_REVISION_BASE : 154 * Determine UV arch type.
155 (is_uv2 ? UV2_HUB_REVISION_BASE : 155 * SGI: UV100/1000
156 UV3_HUB_REVISION_BASE)); 156 * SGI2: UV2000/3000
157 pnodeid = early_get_pnodeid(); 157 * SGI3: UV300 (truncated to 4 chars because of different varieties)
158 early_get_apic_pnode_shift(); 158 */
159 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 159 uv_hub_info->hub_revision =
160 x86_platform.nmi_init = uv_nmi_init; 160 !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
161 if (!strcmp(oem_table_id, "UVL")) 161 !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
162 uv_system_type = UV_LEGACY_APIC; 162 !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
163 else if (!strcmp(oem_table_id, "UVX")) 163
164 uv_system_type = UV_X2APIC; 164 if (uv_hub_info->hub_revision == 0)
165 else if (!strcmp(oem_table_id, "UVH")) { 165 goto badbios;
166 __this_cpu_write(x2apic_extra_bits, 166
167 pnodeid << uvh_apicid.s.pnode_shift); 167 pnodeid = early_get_pnodeid();
168 uv_system_type = UV_NON_UNIQUE_APIC; 168 early_get_apic_pnode_shift();
169 uv_set_apicid_hibit(); 169 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
170 return 1; 170 x86_platform.nmi_init = uv_nmi_init;
171 } 171
172 if (!strcmp(oem_table_id, "UVX")) { /* most common */
173 uv_system_type = UV_X2APIC;
174 uv_apic = 0;
175
176 } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */
177 uv_system_type = UV_NON_UNIQUE_APIC;
178 __this_cpu_write(x2apic_extra_bits,
179 pnodeid << uvh_apicid.s.pnode_shift);
180 uv_set_apicid_hibit();
181 uv_apic = 1;
182
183 } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */
184 uv_system_type = UV_LEGACY_APIC; /* very small systems */
185 uv_apic = 0;
186
187 } else {
188 goto badbios;
172 } 189 }
173 return 0; 190
191 pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
192 oem_id, oem_table_id, uv_system_type,
193 uv_min_hub_revision_id, uv_apic);
194
195 return uv_apic;
196
197badbios:
198 pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
199 pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
200 BUG();
174} 201}
175 202
176enum uv_system_type get_uv_system_type(void) 203enum uv_system_type get_uv_system_type(void)
@@ -854,10 +881,14 @@ void __init uv_system_init(void)
854 unsigned long mmr_base, present, paddr; 881 unsigned long mmr_base, present, paddr;
855 unsigned short pnode_mask; 882 unsigned short pnode_mask;
856 unsigned char n_lshift; 883 unsigned char n_lshift;
857 char *hub = (is_uv1_hub() ? "UV1" : 884 char *hub = (is_uv1_hub() ? "UV100/1000" :
858 (is_uv2_hub() ? "UV2" : 885 (is_uv2_hub() ? "UV2000/3000" :
859 "UV3")); 886 (is_uv3_hub() ? "UV300" : NULL)));
860 887
888 if (!hub) {
889 pr_err("UV: Unknown/unsupported UV hub\n");
890 return;
891 }
861 pr_info("UV: Found %s hub\n", hub); 892 pr_info("UV: Found %s hub\n", hub);
862 map_low_mmrs(); 893 map_low_mmrs();
863 894
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
68 68
69 /* Offset from the sysenter stack to tss.sp0 */ 69 /* Offset from the sysenter stack to tss.sp0 */
70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
71 sizeof(struct tss_struct)); 71 offsetofend(struct tss_struct, SYSENTER_stack));
72 72
73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
74 BLANK(); 74 BLANK();
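The asm-offsets change above replaces sizeof(struct tss_struct) with offsetofend(struct tss_struct, SYSENTER_stack). For readers who have not met the helper, here is a minimal user-space sketch of what offsetofend() evaluates to; the struct and its fields are invented stand-ins, not the real tss_struct, and the macro body mirrors the kernel helper in <linux/stddef.h>.

#include <stddef.h>
#include <stdio.h>

/* Same shape as the kernel helper in <linux/stddef.h>. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct demo_tss {
	unsigned long hw_state[32];		/* stand-in for x86_tss */
	unsigned long sysenter_stack[64];	/* stand-in for SYSENTER_stack */
	unsigned long io_bitmap[1024];		/* anything after the stack no longer matters */
};

int main(void)
{
	/*
	 * offsetofend() points just past sysenter_stack, so the result does
	 * not move when fields are added after it -- unlike sizeof(struct),
	 * which the old TSS_sysenter_sp0 computation depended on.
	 */
	printf("offsetofend = %zu, sizeof = %zu\n",
	       offsetofend(struct demo_tss, sysenter_stack),
	       sizeof(struct demo_tss));
	return 0;
}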
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
81#undef ENTRY 81#undef ENTRY
82 82
83 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 83 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
84 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
84 BLANK(); 85 BLANK();
85 86
86 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 87 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
39endif 39endif
40obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o 40obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
41obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 41obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
42obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o 42obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
43obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
43 44
44obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ 45obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
45 perf_event_intel_uncore_snb.o \ 46 perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..fd470ebf924e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/random.h>
8#include <asm/processor.h> 9#include <asm/processor.h>
9#include <asm/apic.h> 10#include <asm/apic.h>
10#include <asm/cpu.h> 11#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
488 489
489 va_align.mask = (upperbit - 1) & PAGE_MASK; 490 va_align.mask = (upperbit - 1) & PAGE_MASK;
490 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; 491 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
492
493 /* A random value per boot for bit slice [12:upper_bit) */
494 va_align.bits = get_random_int() & va_align.mask;
491 } 495 }
492} 496}
493 497
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 715 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
712 716
713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 717 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
718
719 /* 3DNow or LM implies PREFETCHW */
720 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
721 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
722 set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
714} 723}
715 724
716#ifdef CONFIG_X86_32 725#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
646 c->x86_capability[10] = eax; 646 c->x86_capability[10] = eax;
647 } 647 }
648 648
649 /* Additional Intel-defined flags: level 0x0000000F */
650 if (c->cpuid_level >= 0x0000000F) {
651 u32 eax, ebx, ecx, edx;
652
653 /* QoS sub-leaf, EAX=0Fh, ECX=0 */
654 cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
655 c->x86_capability[11] = edx;
656 if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
657 /* will be overridden if occupancy monitoring exists */
658 c->x86_cache_max_rmid = ebx;
659
660 /* QoS sub-leaf, EAX=0Fh, ECX=1 */
661 cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
662 c->x86_capability[12] = edx;
663 if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
664 c->x86_cache_max_rmid = ecx;
665 c->x86_cache_occ_scale = ebx;
666 }
667 } else {
668 c->x86_cache_max_rmid = -1;
669 c->x86_cache_occ_scale = -1;
670 }
671 }
672
649 /* AMD-defined flags: level 0x80000001 */ 673 /* AMD-defined flags: level 0x80000001 */
650 xlvl = cpuid_eax(0x80000000); 674 xlvl = cpuid_eax(0x80000000);
651 c->extended_cpuid_level = xlvl; 675 c->extended_cpuid_level = xlvl;
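The new leaf-0xF block above caches the CQM parameters: sub-leaf 0 reports the maximum RMID across all resource types in EBX and a resource-type bitmask in EDX, while sub-leaf 1 reports the L3-specific maximum RMID in ECX and the occupancy-to-bytes upscaling factor in EBX. A user-space sketch of the same enumeration, for inspection on a machine that advertises CQM; the decode follows the hunk above and is illustrative, not a reference decoder.

#include <stdio.h>

static inline void cpuid_count(unsigned int leaf, unsigned int subleaf,
			       unsigned int *a, unsigned int *b,
			       unsigned int *c, unsigned int *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "a" (leaf), "c" (subleaf));
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid_count(0, 0, &eax, &ebx, &ecx, &edx);
	if (eax < 0xF)			/* leaf 0xF not implemented */
		return 0;

	cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx);	/* QoS monitoring root */
	printf("leaf 0xF.0: max RMID (all types) = %u, resource-type mask = %#x\n",
	       ebx, edx);

	if (edx & (1u << 1)) {				/* L3 monitoring present */
		cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx);
		printf("leaf 0xF.1: max RMID (L3) = %u, occupancy scale = %u bytes\n",
		       ecx, ebx);
	}
	return 0;
}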
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
834 detect_nopl(c); 858 detect_nopl(c);
835} 859}
836 860
861static void x86_init_cache_qos(struct cpuinfo_x86 *c)
862{
863 /*
864 * The heavy lifting of max_rmid and cache_occ_scale are handled
865 * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
866 * in case CQM bits really aren't there in this CPU.
867 */
868 if (c != &boot_cpu_data) {
869 boot_cpu_data.x86_cache_max_rmid =
870 min(boot_cpu_data.x86_cache_max_rmid,
871 c->x86_cache_max_rmid);
872 }
873}
874
837/* 875/*
838 * This does the hard work of actually picking apart the CPU stuff... 876 * This does the hard work of actually picking apart the CPU stuff...
839 */ 877 */
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
923 961
924 init_hypervisor(c); 962 init_hypervisor(c);
925 x86_init_rdrand(c); 963 x86_init_rdrand(c);
964 x86_init_cache_qos(c);
926 965
927 /* 966 /*
928 * Clear/Set all flags overriden by options, need do it 967 * Clear/Set all flags overriden by options, need do it
@@ -959,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
959#endif 998#endif
960} 999}
961 1000
962#ifdef CONFIG_X86_64 1001/*
963#ifdef CONFIG_IA32_EMULATION 1002 * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
964/* May not be __init: called during resume */ 1003 * on 32-bit kernels:
965static void syscall32_cpu_init(void) 1004 */
966{
967 /* Load these always in case some future AMD CPU supports
968 SYSENTER from compat mode too. */
969 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
970 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
971 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
972
973 wrmsrl(MSR_CSTAR, ia32_cstar_target);
974}
975#endif /* CONFIG_IA32_EMULATION */
976#endif /* CONFIG_X86_64 */
977
978#ifdef CONFIG_X86_32 1005#ifdef CONFIG_X86_32
979void enable_sep_cpu(void) 1006void enable_sep_cpu(void)
980{ 1007{
981 int cpu = get_cpu(); 1008 struct tss_struct *tss;
982 struct tss_struct *tss = &per_cpu(init_tss, cpu); 1009 int cpu;
983 1010
984 if (!boot_cpu_has(X86_FEATURE_SEP)) { 1011 cpu = get_cpu();
985 put_cpu(); 1012 tss = &per_cpu(cpu_tss, cpu);
986 return; 1013
987 } 1014 if (!boot_cpu_has(X86_FEATURE_SEP))
1015 goto out;
1016
1017 /*
1018 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
1019 * see the big comment in struct x86_hw_tss's definition.
1020 */
988 1021
989 tss->x86_tss.ss1 = __KERNEL_CS; 1022 tss->x86_tss.ss1 = __KERNEL_CS;
990 tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; 1023 wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
991 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 1024
992 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); 1025 wrmsr(MSR_IA32_SYSENTER_ESP,
993 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); 1026 (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
1027 0);
1028
1029 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
1030
1031out:
994 put_cpu(); 1032 put_cpu();
995} 1033}
996#endif 1034#endif
@@ -1118,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
1118__setup("clearcpuid=", setup_disablecpuid); 1156__setup("clearcpuid=", setup_disablecpuid);
1119 1157
1120DEFINE_PER_CPU(unsigned long, kernel_stack) = 1158DEFINE_PER_CPU(unsigned long, kernel_stack) =
1121 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; 1159 (unsigned long)&init_thread_union + THREAD_SIZE;
1122EXPORT_PER_CPU_SYMBOL(kernel_stack); 1160EXPORT_PER_CPU_SYMBOL(kernel_stack);
1123 1161
1124#ifdef CONFIG_X86_64 1162#ifdef CONFIG_X86_64
@@ -1130,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
1130 irq_stack_union) __aligned(PAGE_SIZE) __visible; 1168 irq_stack_union) __aligned(PAGE_SIZE) __visible;
1131 1169
1132/* 1170/*
1133 * The following four percpu variables are hot. Align current_task to 1171 * The following percpu variables are hot. Align current_task to
1134 * cacheline size such that all four fall in the same cacheline. 1172 * cacheline size such that they fall in the same cacheline.
1135 */ 1173 */
1136DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = 1174DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
1137 &init_task; 1175 &init_task;
@@ -1171,10 +1209,23 @@ void syscall_init(void)
1171 */ 1209 */
1172 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 1210 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1173 wrmsrl(MSR_LSTAR, system_call); 1211 wrmsrl(MSR_LSTAR, system_call);
1174 wrmsrl(MSR_CSTAR, ignore_sysret);
1175 1212
1176#ifdef CONFIG_IA32_EMULATION 1213#ifdef CONFIG_IA32_EMULATION
1177 syscall32_cpu_init(); 1214 wrmsrl(MSR_CSTAR, ia32_cstar_target);
1215 /*
1216 * This only works on Intel CPUs.
1217 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
1218 * This does not cause SYSENTER to jump to the wrong location, because
1219 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
1220 */
1221 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
1222 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1223 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
1224#else
1225 wrmsrl(MSR_CSTAR, ignore_sysret);
1226 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
1227 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1228 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
1178#endif 1229#endif
1179 1230
1180 /* Flags to clear on syscall */ 1231 /* Flags to clear on syscall */
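For context on the wrmsrl(MSR_STAR, ...) line kept by this hunk: SYSCALL loads CS from STAR[47:32] and SS from STAR[47:32]+8, while a 64-bit SYSRET loads CS from STAR[63:48]+16 and SS from STAR[63:48]+8, which is why __USER32_CS is packed into bits 63:48 and __KERNEL_CS into bits 47:32. A small stand-alone sketch of that packing and the selectors the CPU derives from it; the selector values below are examples, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Pack IA32_STAR the same way syscall_init() does above. */
static uint64_t make_star(uint16_t sysret_base_cs, uint16_t syscall_cs)
{
	return ((uint64_t)sysret_base_cs << 48) | ((uint64_t)syscall_cs << 32);
}

int main(void)
{
	/* Example selectors; the real values come from the GDT layout. */
	uint16_t kernel_cs = 0x10, user32_cs = 0x23;
	uint64_t star = make_star(user32_cs, kernel_cs);
	unsigned int syscall_base = (unsigned int)((star >> 32) & 0xffff);
	unsigned int sysret_base  = (unsigned int)((star >> 48) & 0xffff);

	printf("SYSCALL:  CS=%#x SS=%#x\n", syscall_base, syscall_base + 8);
	printf("SYSRET64: CS=%#x SS=%#x\n", sysret_base + 16, sysret_base + 8);
	return 0;
}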
@@ -1226,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1226EXPORT_PER_CPU_SYMBOL(__preempt_count); 1277EXPORT_PER_CPU_SYMBOL(__preempt_count);
1227DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1278DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1228 1279
1280/*
1281 * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
1282 * the top of the kernel stack. Use an extra percpu variable to track the
1283 * top of the kernel stack directly.
1284 */
1285DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
1286 (unsigned long)&init_thread_union + THREAD_SIZE;
1287EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
1288
1229#ifdef CONFIG_CC_STACKPROTECTOR 1289#ifdef CONFIG_CC_STACKPROTECTOR
1230DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 1290DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1231#endif 1291#endif
@@ -1307,7 +1367,7 @@ void cpu_init(void)
1307 */ 1367 */
1308 load_ucode_ap(); 1368 load_ucode_ap();
1309 1369
1310 t = &per_cpu(init_tss, cpu); 1370 t = &per_cpu(cpu_tss, cpu);
1311 oist = &per_cpu(orig_ist, cpu); 1371 oist = &per_cpu(orig_ist, cpu);
1312 1372
1313#ifdef CONFIG_NUMA 1373#ifdef CONFIG_NUMA
@@ -1391,7 +1451,7 @@ void cpu_init(void)
1391{ 1451{
1392 int cpu = smp_processor_id(); 1452 int cpu = smp_processor_id();
1393 struct task_struct *curr = current; 1453 struct task_struct *curr = current;
1394 struct tss_struct *t = &per_cpu(init_tss, cpu); 1454 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
1395 struct thread_struct *thread = &curr->thread; 1455 struct thread_struct *thread = &curr->thread;
1396 1456
1397 wait_for_master_cpu(cpu); 1457 wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/device.h> 11#include <linux/cacheinfo.h>
13#include <linux/compiler.h>
14#include <linux/cpu.h> 12#include <linux/cpu.h>
15#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/sysfs.h>
16#include <linux/pci.h> 15#include <linux/pci.h>
17 16
18#include <asm/processor.h> 17#include <asm/processor.h>
19#include <linux/smp.h>
20#include <asm/amd_nb.h> 18#include <asm/amd_nb.h>
21#include <asm/smp.h> 19#include <asm/smp.h>
22 20
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
116 114
117 115
118enum _cache_type { 116enum _cache_type {
119 CACHE_TYPE_NULL = 0, 117 CTYPE_NULL = 0,
120 CACHE_TYPE_DATA = 1, 118 CTYPE_DATA = 1,
121 CACHE_TYPE_INST = 2, 119 CTYPE_INST = 2,
122 CACHE_TYPE_UNIFIED = 3 120 CTYPE_UNIFIED = 3
123}; 121};
124 122
125union _cpuid4_leaf_eax { 123union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
159 struct amd_northbridge *nb; 157 struct amd_northbridge *nb;
160}; 158};
161 159
162struct _cpuid4_info {
163 struct _cpuid4_info_regs base;
164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
165};
166
167unsigned short num_cache_leaves; 160unsigned short num_cache_leaves;
168 161
169/* AMD doesn't have CPUID4. Emulate it here to report the same 162/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
220static const unsigned char levels[] = { 1, 1, 2, 3 }; 213static const unsigned char levels[] = { 1, 1, 2, 3 };
221static const unsigned char types[] = { 1, 2, 3, 3 }; 214static const unsigned char types[] = { 1, 2, 3, 3 };
222 215
216static const enum cache_type cache_type_map[] = {
217 [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
218 [CTYPE_DATA] = CACHE_TYPE_DATA,
219 [CTYPE_INST] = CACHE_TYPE_INST,
220 [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
221};
222
223static void 223static void
224amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 224amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
225 union _cpuid4_leaf_ebx *ebx, 225 union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
291 (ebx->split.ways_of_associativity + 1) - 1; 291 (ebx->split.ways_of_associativity + 1) - 1;
292} 292}
293 293
294struct _cache_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
297 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
298 unsigned int);
299};
300
301#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) 294#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
295
302/* 296/*
303 * L3 cache descriptors 297 * L3 cache descriptors
304 */ 298 */
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
325 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 319 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
326} 320}
327 321
328static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
329{
330 int node;
331
332 /* only for L3, and not in virtualized environments */
333 if (index < 3)
334 return;
335
336 node = amd_get_nb_id(smp_processor_id());
337 this_leaf->nb = node_to_amd_nb(node);
338 if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
339 amd_calc_l3_indices(this_leaf->nb);
340}
341
342/* 322/*
343 * check whether a slot used for disabling an L3 index is occupied. 323 * check whether a slot used for disabling an L3 index is occupied.
344 * @l3: L3 cache descriptor 324 * @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
359 return -1; 339 return -1;
360} 340}
361 341
362static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 342static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
363 unsigned int slot) 343 unsigned int slot)
364{ 344{
365 int index; 345 int index;
346 struct amd_northbridge *nb = this_leaf->priv;
366 347
367 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 348 index = amd_get_l3_disable_slot(nb, slot);
368 return -EINVAL;
369
370 index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
371 if (index >= 0) 349 if (index >= 0)
372 return sprintf(buf, "%d\n", index); 350 return sprintf(buf, "%d\n", index);
373 351
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
376 354
377#define SHOW_CACHE_DISABLE(slot) \ 355#define SHOW_CACHE_DISABLE(slot) \
378static ssize_t \ 356static ssize_t \
379show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ 357cache_disable_##slot##_show(struct device *dev, \
380 unsigned int cpu) \ 358 struct device_attribute *attr, char *buf) \
381{ \ 359{ \
360 struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
382 return show_cache_disable(this_leaf, buf, slot); \ 361 return show_cache_disable(this_leaf, buf, slot); \
383} 362}
384SHOW_CACHE_DISABLE(0) 363SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
446 return 0; 425 return 0;
447} 426}
448 427
449static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 428static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
450 const char *buf, size_t count, 429 const char *buf, size_t count,
451 unsigned int slot) 430 unsigned int slot)
452{ 431{
453 unsigned long val = 0; 432 unsigned long val = 0;
454 int cpu, err = 0; 433 int cpu, err = 0;
434 struct amd_northbridge *nb = this_leaf->priv;
455 435
456 if (!capable(CAP_SYS_ADMIN)) 436 if (!capable(CAP_SYS_ADMIN))
457 return -EPERM; 437 return -EPERM;
458 438
459 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 439 cpu = cpumask_first(&this_leaf->shared_cpu_map);
460 return -EINVAL;
461
462 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
463 440
464 if (kstrtoul(buf, 10, &val) < 0) 441 if (kstrtoul(buf, 10, &val) < 0)
465 return -EINVAL; 442 return -EINVAL;
466 443
467 err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); 444 err = amd_set_l3_disable_slot(nb, cpu, slot, val);
468 if (err) { 445 if (err) {
469 if (err == -EEXIST) 446 if (err == -EEXIST)
470 pr_warning("L3 slot %d in use/index already disabled!\n", 447 pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
476 453
477#define STORE_CACHE_DISABLE(slot) \ 454#define STORE_CACHE_DISABLE(slot) \
478static ssize_t \ 455static ssize_t \
479store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 456cache_disable_##slot##_store(struct device *dev, \
480 const char *buf, size_t count, \ 457 struct device_attribute *attr, \
481 unsigned int cpu) \ 458 const char *buf, size_t count) \
482{ \ 459{ \
460 struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
483 return store_cache_disable(this_leaf, buf, count, slot); \ 461 return store_cache_disable(this_leaf, buf, count, slot); \
484} 462}
485STORE_CACHE_DISABLE(0) 463STORE_CACHE_DISABLE(0)
486STORE_CACHE_DISABLE(1) 464STORE_CACHE_DISABLE(1)
487 465
488static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 466static ssize_t subcaches_show(struct device *dev,
489 show_cache_disable_0, store_cache_disable_0); 467 struct device_attribute *attr, char *buf)
490static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
491 show_cache_disable_1, store_cache_disable_1);
492
493static ssize_t
494show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
495{ 468{
496 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 469 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
497 return -EINVAL; 470 int cpu = cpumask_first(&this_leaf->shared_cpu_map);
498 471
499 return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); 472 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
500} 473}
501 474
502static ssize_t 475static ssize_t subcaches_store(struct device *dev,
503store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, 476 struct device_attribute *attr,
504 unsigned int cpu) 477 const char *buf, size_t count)
505{ 478{
479 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
480 int cpu = cpumask_first(&this_leaf->shared_cpu_map);
506 unsigned long val; 481 unsigned long val;
507 482
508 if (!capable(CAP_SYS_ADMIN)) 483 if (!capable(CAP_SYS_ADMIN))
509 return -EPERM; 484 return -EPERM;
510 485
511 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
512 return -EINVAL;
513
514 if (kstrtoul(buf, 16, &val) < 0) 486 if (kstrtoul(buf, 16, &val) < 0)
515 return -EINVAL; 487 return -EINVAL;
516 488
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
520 return count; 492 return count;
521} 493}
522 494
523static struct _cache_attr subcaches = 495static DEVICE_ATTR_RW(cache_disable_0);
524 __ATTR(subcaches, 0644, show_subcaches, store_subcaches); 496static DEVICE_ATTR_RW(cache_disable_1);
497static DEVICE_ATTR_RW(subcaches);
498
499static umode_t
500cache_private_attrs_is_visible(struct kobject *kobj,
501 struct attribute *attr, int unused)
502{
503 struct device *dev = kobj_to_dev(kobj);
504 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
505 umode_t mode = attr->mode;
506
507 if (!this_leaf->priv)
508 return 0;
509
510 if ((attr == &dev_attr_subcaches.attr) &&
511 amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
512 return mode;
513
514 if ((attr == &dev_attr_cache_disable_0.attr ||
515 attr == &dev_attr_cache_disable_1.attr) &&
516 amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
517 return mode;
518
519 return 0;
520}
521
522static struct attribute_group cache_private_group = {
523 .is_visible = cache_private_attrs_is_visible,
524};
525
526static void init_amd_l3_attrs(void)
527{
528 int n = 1;
529 static struct attribute **amd_l3_attrs;
530
531 if (amd_l3_attrs) /* already initialized */
532 return;
533
534 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
535 n += 2;
536 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
537 n += 1;
538
539 amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
540 if (!amd_l3_attrs)
541 return;
542
543 n = 0;
544 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
545 amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
546 amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
547 }
548 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
549 amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
525 550
551 cache_private_group.attrs = amd_l3_attrs;
552}
553
554const struct attribute_group *
555cache_get_priv_group(struct cacheinfo *this_leaf)
556{
557 struct amd_northbridge *nb = this_leaf->priv;
558
559 if (this_leaf->level < 3 || !nb)
560 return NULL;
561
562 if (nb && nb->l3_cache.indices)
563 init_amd_l3_attrs();
564
565 return &cache_private_group;
566}
567
568static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
569{
570 int node;
571
572 /* only for L3, and not in virtualized environments */
573 if (index < 3)
574 return;
575
576 node = amd_get_nb_id(smp_processor_id());
577 this_leaf->nb = node_to_amd_nb(node);
578 if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
579 amd_calc_l3_indices(this_leaf->nb);
580}
526#else 581#else
527#define amd_init_l3_cache(x, y) 582#define amd_init_l3_cache(x, y)
528#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ 583#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
546 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 601 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
547 } 602 }
548 603
549 if (eax.split.type == CACHE_TYPE_NULL) 604 if (eax.split.type == CTYPE_NULL)
550 return -EIO; /* better error ? */ 605 return -EIO; /* better error ? */
551 606
552 this_leaf->eax = eax; 607 this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
575 /* Do cpuid(op) loop to find out num_cache_leaves */ 630 /* Do cpuid(op) loop to find out num_cache_leaves */
576 cpuid_count(op, i, &eax, &ebx, &ecx, &edx); 631 cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
577 cache_eax.full = eax; 632 cache_eax.full = eax;
578 } while (cache_eax.split.type != CACHE_TYPE_NULL); 633 } while (cache_eax.split.type != CTYPE_NULL);
579 return i; 634 return i;
580} 635}
581 636
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
626 681
627 switch (this_leaf.eax.split.level) { 682 switch (this_leaf.eax.split.level) {
628 case 1: 683 case 1:
629 if (this_leaf.eax.split.type == CACHE_TYPE_DATA) 684 if (this_leaf.eax.split.type == CTYPE_DATA)
630 new_l1d = this_leaf.size/1024; 685 new_l1d = this_leaf.size/1024;
631 else if (this_leaf.eax.split.type == CACHE_TYPE_INST) 686 else if (this_leaf.eax.split.type == CTYPE_INST)
632 new_l1i = this_leaf.size/1024; 687 new_l1i = this_leaf.size/1024;
633 break; 688 break;
634 case 2: 689 case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
747 return l2; 802 return l2;
748} 803}
749 804
750#ifdef CONFIG_SYSFS 805static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
751 806 struct _cpuid4_info_regs *base)
752/* pointer to _cpuid4_info array (for each cache leaf) */
753static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
754#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
755
756#ifdef CONFIG_SMP
757
758static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
759{ 807{
760 struct _cpuid4_info *this_leaf; 808 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
809 struct cacheinfo *this_leaf;
761 int i, sibling; 810 int i, sibling;
762 811
763 if (cpu_has_topoext) { 812 if (cpu_has_topoext) {
764 unsigned int apicid, nshared, first, last; 813 unsigned int apicid, nshared, first, last;
765 814
766 if (!per_cpu(ici_cpuid4_info, cpu)) 815 this_leaf = this_cpu_ci->info_list + index;
767 return 0; 816 nshared = base->eax.split.num_threads_sharing + 1;
768
769 this_leaf = CPUID4_INFO_IDX(cpu, index);
770 nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
771 apicid = cpu_data(cpu).apicid; 817 apicid = cpu_data(cpu).apicid;
772 first = apicid - (apicid % nshared); 818 first = apicid - (apicid % nshared);
773 last = first + nshared - 1; 819 last = first + nshared - 1;
774 820
775 for_each_online_cpu(i) { 821 for_each_online_cpu(i) {
822 this_cpu_ci = get_cpu_cacheinfo(i);
823 if (!this_cpu_ci->info_list)
824 continue;
825
776 apicid = cpu_data(i).apicid; 826 apicid = cpu_data(i).apicid;
777 if ((apicid < first) || (apicid > last)) 827 if ((apicid < first) || (apicid > last))
778 continue; 828 continue;
779 if (!per_cpu(ici_cpuid4_info, i)) 829
780 continue; 830 this_leaf = this_cpu_ci->info_list + index;
781 this_leaf = CPUID4_INFO_IDX(i, index);
782 831
783 for_each_online_cpu(sibling) { 832 for_each_online_cpu(sibling) {
784 apicid = cpu_data(sibling).apicid; 833 apicid = cpu_data(sibling).apicid;
785 if ((apicid < first) || (apicid > last)) 834 if ((apicid < first) || (apicid > last))
786 continue; 835 continue;
787 set_bit(sibling, this_leaf->shared_cpu_map); 836 cpumask_set_cpu(sibling,
837 &this_leaf->shared_cpu_map);
788 } 838 }
789 } 839 }
790 } else if (index == 3) { 840 } else if (index == 3) {
791 for_each_cpu(i, cpu_llc_shared_mask(cpu)) { 841 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
792 if (!per_cpu(ici_cpuid4_info, i)) 842 this_cpu_ci = get_cpu_cacheinfo(i);
843 if (!this_cpu_ci->info_list)
793 continue; 844 continue;
794 this_leaf = CPUID4_INFO_IDX(i, index); 845 this_leaf = this_cpu_ci->info_list + index;
795 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { 846 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
796 if (!cpu_online(sibling)) 847 if (!cpu_online(sibling))
797 continue; 848 continue;
798 set_bit(sibling, this_leaf->shared_cpu_map); 849 cpumask_set_cpu(sibling,
850 &this_leaf->shared_cpu_map);
799 } 851 }
800 } 852 }
801 } else 853 } else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
804 return 1; 856 return 1;
805} 857}
806 858
807static void cache_shared_cpu_map_setup(unsigned int cpu, int index) 859static void __cache_cpumap_setup(unsigned int cpu, int index,
860 struct _cpuid4_info_regs *base)
808{ 861{
809 struct _cpuid4_info *this_leaf, *sibling_leaf; 862 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
863 struct cacheinfo *this_leaf, *sibling_leaf;
810 unsigned long num_threads_sharing; 864 unsigned long num_threads_sharing;
811 int index_msb, i; 865 int index_msb, i;
812 struct cpuinfo_x86 *c = &cpu_data(cpu); 866 struct cpuinfo_x86 *c = &cpu_data(cpu);
813 867
814 if (c->x86_vendor == X86_VENDOR_AMD) { 868 if (c->x86_vendor == X86_VENDOR_AMD) {
815 if (cache_shared_amd_cpu_map_setup(cpu, index)) 869 if (__cache_amd_cpumap_setup(cpu, index, base))
816 return; 870 return;
817 } 871 }
818 872
819 this_leaf = CPUID4_INFO_IDX(cpu, index); 873 this_leaf = this_cpu_ci->info_list + index;
820 num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; 874 num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
821 875
876 cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
822 if (num_threads_sharing == 1) 877 if (num_threads_sharing == 1)
823 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); 878 return;
824 else {
825 index_msb = get_count_order(num_threads_sharing);
826
827 for_each_online_cpu(i) {
828 if (cpu_data(i).apicid >> index_msb ==
829 c->apicid >> index_msb) {
830 cpumask_set_cpu(i,
831 to_cpumask(this_leaf->shared_cpu_map));
832 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
833 sibling_leaf =
834 CPUID4_INFO_IDX(i, index);
835 cpumask_set_cpu(cpu, to_cpumask(
836 sibling_leaf->shared_cpu_map));
837 }
838 }
839 }
840 }
841}
842static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
843{
844 struct _cpuid4_info *this_leaf, *sibling_leaf;
845 int sibling;
846
847 this_leaf = CPUID4_INFO_IDX(cpu, index);
848 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
849 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
850 cpumask_clear_cpu(cpu,
851 to_cpumask(sibling_leaf->shared_cpu_map));
852 }
853}
854#else
855static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
856{
857}
858
859static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
860{
861}
862#endif
863
864static void free_cache_attributes(unsigned int cpu)
865{
866 int i;
867
868 for (i = 0; i < num_cache_leaves; i++)
869 cache_remove_shared_cpu_map(cpu, i);
870
871 kfree(per_cpu(ici_cpuid4_info, cpu));
872 per_cpu(ici_cpuid4_info, cpu) = NULL;
873}
874
875static void get_cpu_leaves(void *_retval)
876{
877 int j, *retval = _retval, cpu = smp_processor_id();
878 879
879 /* Do cpuid and store the results */ 880 index_msb = get_count_order(num_threads_sharing);
880 for (j = 0; j < num_cache_leaves; j++) {
881 struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
882 881
883 *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); 882 for_each_online_cpu(i)
884 if (unlikely(*retval < 0)) { 883 if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
885 int i; 884 struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
886 885
887 for (i = 0; i < j; i++) 886 if (i == cpu || !sib_cpu_ci->info_list)
888 cache_remove_shared_cpu_map(cpu, i); 887 continue;/* skip if itself or no cacheinfo */
889 break; 888 sibling_leaf = sib_cpu_ci->info_list + index;
889 cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
890 cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
890 } 891 }
891 cache_shared_cpu_map_setup(cpu, j);
892 }
893} 892}
894 893
895static int detect_cache_attributes(unsigned int cpu) 894static void ci_leaf_init(struct cacheinfo *this_leaf,
895 struct _cpuid4_info_regs *base)
896{ 896{
897 int retval; 897 this_leaf->level = base->eax.split.level;
898 898 this_leaf->type = cache_type_map[base->eax.split.type];
899 if (num_cache_leaves == 0) 899 this_leaf->coherency_line_size =
900 return -ENOENT; 900 base->ebx.split.coherency_line_size + 1;
901 901 this_leaf->ways_of_associativity =
902 per_cpu(ici_cpuid4_info, cpu) = kzalloc( 902 base->ebx.split.ways_of_associativity + 1;
903 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 903 this_leaf->size = base->size;
904 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 904 this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
905 return -ENOMEM; 905 this_leaf->physical_line_partition =
906 906 base->ebx.split.physical_line_partition + 1;
907 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 907 this_leaf->priv = base->nb;
908 if (retval) {
909 kfree(per_cpu(ici_cpuid4_info, cpu));
910 per_cpu(ici_cpuid4_info, cpu) = NULL;
911 }
912
913 return retval;
914} 908}
915 909
916#include <linux/kobject.h> 910static int __init_cache_level(unsigned int cpu)
917#include <linux/sysfs.h>
918#include <linux/cpu.h>
919
920/* pointer to kobject for cpuX/cache */
921static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
922
923struct _index_kobject {
924 struct kobject kobj;
925 unsigned int cpu;
926 unsigned short index;
927};
928
929/* pointer to array of kobjects for cpuX/cache/indexY */
930static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
931#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
932
933#define show_one_plus(file_name, object, val) \
934static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
935 unsigned int cpu) \
936{ \
937 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
938}
939
940show_one_plus(level, base.eax.split.level, 0);
941show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
942show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
943show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
944show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
945
946static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
947 unsigned int cpu)
948{
949 return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
950}
951
952static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
953 int type, char *buf)
954{
955 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
956 int ret;
957
958 if (type)
959 ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
960 cpumask_pr_args(mask));
961 else
962 ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
963 cpumask_pr_args(mask));
964 buf[ret++] = '\n';
965 buf[ret] = '\0';
966 return ret;
967}
968
969static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
970 unsigned int cpu)
971{ 911{
972 return show_shared_cpu_map_func(leaf, 0, buf); 912 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
973}
974
975static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
976 unsigned int cpu)
977{
978 return show_shared_cpu_map_func(leaf, 1, buf);
979}
980 913
981static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, 914 if (!num_cache_leaves)
982 unsigned int cpu)
983{
984 switch (this_leaf->base.eax.split.type) {
985 case CACHE_TYPE_DATA:
986 return sprintf(buf, "Data\n");
987 case CACHE_TYPE_INST:
988 return sprintf(buf, "Instruction\n");
989 case CACHE_TYPE_UNIFIED:
990 return sprintf(buf, "Unified\n");
991 default:
992 return sprintf(buf, "Unknown\n");
993 }
994}
995
996#define to_object(k) container_of(k, struct _index_kobject, kobj)
997#define to_attr(a) container_of(a, struct _cache_attr, attr)
998
999#define define_one_ro(_name) \
1000static struct _cache_attr _name = \
1001 __ATTR(_name, 0444, show_##_name, NULL)
1002
1003define_one_ro(level);
1004define_one_ro(type);
1005define_one_ro(coherency_line_size);
1006define_one_ro(physical_line_partition);
1007define_one_ro(ways_of_associativity);
1008define_one_ro(number_of_sets);
1009define_one_ro(size);
1010define_one_ro(shared_cpu_map);
1011define_one_ro(shared_cpu_list);
1012
1013static struct attribute *default_attrs[] = {
1014 &type.attr,
1015 &level.attr,
1016 &coherency_line_size.attr,
1017 &physical_line_partition.attr,
1018 &ways_of_associativity.attr,
1019 &number_of_sets.attr,
1020 &size.attr,
1021 &shared_cpu_map.attr,
1022 &shared_cpu_list.attr,
1023 NULL
1024};
1025
1026#ifdef CONFIG_AMD_NB
1027static struct attribute **amd_l3_attrs(void)
1028{
1029 static struct attribute **attrs;
1030 int n;
1031
1032 if (attrs)
1033 return attrs;
1034
1035 n = ARRAY_SIZE(default_attrs);
1036
1037 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1038 n += 2;
1039
1040 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1041 n += 1;
1042
1043 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
1044 if (attrs == NULL)
1045 return attrs = default_attrs;
1046
1047 for (n = 0; default_attrs[n]; n++)
1048 attrs[n] = default_attrs[n];
1049
1050 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
1051 attrs[n++] = &cache_disable_0.attr;
1052 attrs[n++] = &cache_disable_1.attr;
1053 }
1054
1055 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1056 attrs[n++] = &subcaches.attr;
1057
1058 return attrs;
1059}
1060#endif
1061
1062static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1063{
1064 struct _cache_attr *fattr = to_attr(attr);
1065 struct _index_kobject *this_leaf = to_object(kobj);
1066 ssize_t ret;
1067
1068 ret = fattr->show ?
1069 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1070 buf, this_leaf->cpu) :
1071 0;
1072 return ret;
1073}
1074
1075static ssize_t store(struct kobject *kobj, struct attribute *attr,
1076 const char *buf, size_t count)
1077{
1078 struct _cache_attr *fattr = to_attr(attr);
1079 struct _index_kobject *this_leaf = to_object(kobj);
1080 ssize_t ret;
1081
1082 ret = fattr->store ?
1083 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1084 buf, count, this_leaf->cpu) :
1085 0;
1086 return ret;
1087}
1088
1089static const struct sysfs_ops sysfs_ops = {
1090 .show = show,
1091 .store = store,
1092};
1093
1094static struct kobj_type ktype_cache = {
1095 .sysfs_ops = &sysfs_ops,
1096 .default_attrs = default_attrs,
1097};
1098
1099static struct kobj_type ktype_percpu_entry = {
1100 .sysfs_ops = &sysfs_ops,
1101};
1102
1103static void cpuid4_cache_sysfs_exit(unsigned int cpu)
1104{
1105 kfree(per_cpu(ici_cache_kobject, cpu));
1106 kfree(per_cpu(ici_index_kobject, cpu));
1107 per_cpu(ici_cache_kobject, cpu) = NULL;
1108 per_cpu(ici_index_kobject, cpu) = NULL;
1109 free_cache_attributes(cpu);
1110}
1111
1112static int cpuid4_cache_sysfs_init(unsigned int cpu)
1113{
1114 int err;
1115
1116 if (num_cache_leaves == 0)
1117 return -ENOENT; 915 return -ENOENT;
1118 916 if (!this_cpu_ci)
1119 err = detect_cache_attributes(cpu); 917 return -EINVAL;
1120 if (err) 918 this_cpu_ci->num_levels = 3;
1121 return err; 919 this_cpu_ci->num_leaves = num_cache_leaves;
1122
1123 /* Allocate all required memory */
1124 per_cpu(ici_cache_kobject, cpu) =
1125 kzalloc(sizeof(struct kobject), GFP_KERNEL);
1126 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
1127 goto err_out;
1128
1129 per_cpu(ici_index_kobject, cpu) = kzalloc(
1130 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
1131 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
1132 goto err_out;
1133
1134 return 0; 920 return 0;
1135
1136err_out:
1137 cpuid4_cache_sysfs_exit(cpu);
1138 return -ENOMEM;
1139} 921}
1140 922
1141static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 923static int __populate_cache_leaves(unsigned int cpu)
1142
1143/* Add/Remove cache interface for CPU device */
1144static int cache_add_dev(struct device *dev)
1145{ 924{
1146 unsigned int cpu = dev->id; 925 unsigned int idx, ret;
1147 unsigned long i, j; 926 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
1148 struct _index_kobject *this_object; 927 struct cacheinfo *this_leaf = this_cpu_ci->info_list;
1149 struct _cpuid4_info *this_leaf; 928 struct _cpuid4_info_regs id4_regs = {};
1150 int retval;
1151
1152 retval = cpuid4_cache_sysfs_init(cpu);
1153 if (unlikely(retval < 0))
1154 return retval;
1155
1156 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1157 &ktype_percpu_entry,
1158 &dev->kobj, "%s", "cache");
1159 if (retval < 0) {
1160 cpuid4_cache_sysfs_exit(cpu);
1161 return retval;
1162 }
1163 929
1164 for (i = 0; i < num_cache_leaves; i++) { 930 for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
1165 this_object = INDEX_KOBJECT_PTR(cpu, i); 931 ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
1166 this_object->cpu = cpu; 932 if (ret)
1167 this_object->index = i; 933 return ret;
1168 934 ci_leaf_init(this_leaf++, &id4_regs);
1169 this_leaf = CPUID4_INFO_IDX(cpu, i); 935 __cache_cpumap_setup(cpu, idx, &id4_regs);
1170
1171 ktype_cache.default_attrs = default_attrs;
1172#ifdef CONFIG_AMD_NB
1173 if (this_leaf->base.nb)
1174 ktype_cache.default_attrs = amd_l3_attrs();
1175#endif
1176 retval = kobject_init_and_add(&(this_object->kobj),
1177 &ktype_cache,
1178 per_cpu(ici_cache_kobject, cpu),
1179 "index%1lu", i);
1180 if (unlikely(retval)) {
1181 for (j = 0; j < i; j++)
1182 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
1183 kobject_put(per_cpu(ici_cache_kobject, cpu));
1184 cpuid4_cache_sysfs_exit(cpu);
1185 return retval;
1186 }
1187 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
1188 } 936 }
1189 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
1190
1191 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
1192 return 0; 937 return 0;
1193} 938}
1194 939
1195static void cache_remove_dev(struct device *dev) 940DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
1196{ 941DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
1197 unsigned int cpu = dev->id;
1198 unsigned long i;
1199
1200 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
1201 return;
1202 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
1203 return;
1204 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
1205
1206 for (i = 0; i < num_cache_leaves; i++)
1207 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
1208 kobject_put(per_cpu(ici_cache_kobject, cpu));
1209 cpuid4_cache_sysfs_exit(cpu);
1210}
1211
1212static int cacheinfo_cpu_callback(struct notifier_block *nfb,
1213 unsigned long action, void *hcpu)
1214{
1215 unsigned int cpu = (unsigned long)hcpu;
1216 struct device *dev;
1217
1218 dev = get_cpu_device(cpu);
1219 switch (action) {
1220 case CPU_ONLINE:
1221 case CPU_ONLINE_FROZEN:
1222 cache_add_dev(dev);
1223 break;
1224 case CPU_DEAD:
1225 case CPU_DEAD_FROZEN:
1226 cache_remove_dev(dev);
1227 break;
1228 }
1229 return NOTIFY_OK;
1230}
1231
1232static struct notifier_block cacheinfo_cpu_notifier = {
1233 .notifier_call = cacheinfo_cpu_callback,
1234};
1235
1236static int __init cache_sysfs_init(void)
1237{
1238 int i, err = 0;
1239
1240 if (num_cache_leaves == 0)
1241 return 0;
1242
1243 cpu_notifier_register_begin();
1244 for_each_online_cpu(i) {
1245 struct device *dev = get_cpu_device(i);
1246
1247 err = cache_add_dev(dev);
1248 if (err)
1249 goto out;
1250 }
1251 __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
1252
1253out:
1254 cpu_notifier_register_done();
1255 return err;
1256}
1257
1258device_initcall(cache_sysfs_init);
1259
1260#endif
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
1/*
2 * Intel(R) Processor Trace PMU driver for perf
3 * Copyright (c) 2013-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15 * Programming Reference:
16 * http://software.intel.com/en-us/intel-isa-extensions
17 */
18
19#ifndef __INTEL_PT_H__
20#define __INTEL_PT_H__
21
22/*
23 * Single-entry ToPA: when this close to region boundary, switch
24 * buffers to avoid losing data.
25 */
26#define TOPA_PMI_MARGIN 512
27
28/*
29 * Table of Physical Addresses bits
30 */
31enum topa_sz {
32 TOPA_4K = 0,
33 TOPA_8K,
34 TOPA_16K,
35 TOPA_32K,
36 TOPA_64K,
37 TOPA_128K,
38 TOPA_256K,
39 TOPA_512K,
40 TOPA_1MB,
41 TOPA_2MB,
42 TOPA_4MB,
43 TOPA_8MB,
44 TOPA_16MB,
45 TOPA_32MB,
46 TOPA_64MB,
47 TOPA_128MB,
48 TOPA_SZ_END,
49};
50
51static inline unsigned int sizes(enum topa_sz tsz)
52{
53 return 1 << (tsz + 12);
54};
55
56struct topa_entry {
57 u64 end : 1;
58 u64 rsvd0 : 1;
59 u64 intr : 1;
60 u64 rsvd1 : 1;
61 u64 stop : 1;
62 u64 rsvd2 : 1;
63 u64 size : 4;
64 u64 rsvd3 : 2;
65 u64 base : 36;
66 u64 rsvd4 : 16;
67};
68
69#define TOPA_SHIFT 12
70#define PT_CPUID_LEAVES 2
71
72enum pt_capabilities {
73 PT_CAP_max_subleaf = 0,
74 PT_CAP_cr3_filtering,
75 PT_CAP_topa_output,
76 PT_CAP_topa_multiple_entries,
77 PT_CAP_payloads_lip,
78};
79
80struct pt_pmu {
81 struct pmu pmu;
82 u32 caps[4 * PT_CPUID_LEAVES];
83};
84
85/**
86 * struct pt_buffer - buffer configuration; one buffer per task_struct or
87 * cpu, depending on perf event configuration
88 * @cpu: cpu for per-cpu allocation
89 * @tables: list of ToPA tables in this buffer
90 * @first: shorthand for first topa table
91 * @last: shorthand for last topa table
92 * @cur: current topa table
93 * @nr_pages: buffer size in pages
94 * @cur_idx: current output region's index within @cur table
95 * @output_off: offset within the current output region
96 * @data_size: running total of the amount of data in this buffer
97 * @lost: if data was lost/truncated
98 * @head: logical write offset inside the buffer
99 * @snapshot: if this is for a snapshot/overwrite counter
100 * @stop_pos: STOP topa entry in the buffer
101 * @intr_pos: INT topa entry in the buffer
102 * @data_pages: array of pages from perf
103 * @topa_index: table of topa entries indexed by page offset
104 */
105struct pt_buffer {
106 int cpu;
107 struct list_head tables;
108 struct topa *first, *last, *cur;
109 unsigned int cur_idx;
110 size_t output_off;
111 unsigned long nr_pages;
112 local_t data_size;
113 local_t lost;
114 local64_t head;
115 bool snapshot;
116 unsigned long stop_pos, intr_pos;
117 void **data_pages;
118 struct topa_entry *topa_index[0];
119};
120
121/**
122 * struct pt - per-cpu pt context
123 * @handle: perf output handle
124 * @handle_nmi: do handle PT PMI on this cpu, there's an active event
125 */
126struct pt {
127 struct perf_output_handle handle;
128 int handle_nmi;
129};
130
131#endif /* __INTEL_PT_H__ */
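The sizes() helper in the new header maps the 4-bit ToPA size code to an output-region size of 1 << (code + 12), so TOPA_4K selects 4 KiB and TOPA_128MB selects 128 MiB. A stand-alone sketch of that encoding; the enum is copied from intel_pt.h above, while the rest is illustrative rather than driver code.

#include <stdio.h>

enum topa_sz {
	TOPA_4K = 0, TOPA_8K, TOPA_16K, TOPA_32K, TOPA_64K, TOPA_128K,
	TOPA_256K, TOPA_512K, TOPA_1MB, TOPA_2MB, TOPA_4MB, TOPA_8MB,
	TOPA_16MB, TOPA_32MB, TOPA_64MB, TOPA_128MB, TOPA_SZ_END,
};

/* Same formula as sizes() in the header: a 4-bit code selects 2^(code+12) bytes. */
static unsigned int topa_region_bytes(enum topa_sz tsz)
{
	return 1u << (tsz + 12);
}

int main(void)
{
	enum topa_sz tsz;

	for (tsz = TOPA_4K; tsz < TOPA_SZ_END; tsz++)
		printf("code %2d -> %u bytes\n", tsz, topa_region_bytes(tsz));
	return 0;
}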
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
14}; 14};
15 15
16#define ATTR_LEN 16 16#define ATTR_LEN 16
17#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
17 18
18/* One object for each MCE bank, shared by all CPUs */ 19/* One object for each MCE bank, shared by all CPUs */
19struct mce_bank { 20struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
23 char attrname[ATTR_LEN]; /* attribute name */ 24 char attrname[ATTR_LEN]; /* attribute name */
24}; 25};
25 26
26int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); 27extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
27struct dentry *mce_get_debugfs_dir(void); 28struct dentry *mce_get_debugfs_dir(void);
28 29
29extern struct mce_bank *mce_banks; 30extern struct mce_bank *mce_banks;
30extern mce_banks_t mce_banks_ce_disabled; 31extern mce_banks_t mce_banks_ce_disabled;
31 32
32#ifdef CONFIG_X86_MCE_INTEL 33#ifdef CONFIG_X86_MCE_INTEL
33unsigned long mce_intel_adjust_timer(unsigned long interval); 34unsigned long cmci_intel_adjust_timer(unsigned long interval);
34void mce_intel_cmci_poll(void); 35bool mce_intel_cmci_poll(void);
35void mce_intel_hcpu_update(unsigned long cpu); 36void mce_intel_hcpu_update(unsigned long cpu);
36void cmci_disable_bank(int bank); 37void cmci_disable_bank(int bank);
37#else 38#else
38# define mce_intel_adjust_timer mce_adjust_timer_default 39# define cmci_intel_adjust_timer mce_adjust_timer_default
39static inline void mce_intel_cmci_poll(void) { } 40static inline bool mce_intel_cmci_poll(void) { return false; }
40static inline void mce_intel_hcpu_update(unsigned long cpu) { } 41static inline void mce_intel_hcpu_update(unsigned long cpu) { }
41static inline void cmci_disable_bank(int bank) { } 42static inline void cmci_disable_bank(int bank) { }
42#endif 43#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
187} 187}
188 188
189int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) 189/*
190 * See AMD Error Scope Hierarchy table in a newer BKDG. For example
191 * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
192 */
193static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
194{
195 enum context ctx = error_context(m);
196
197 /* Processor Context Corrupt, no need to fumble too much, die! */
198 if (m->status & MCI_STATUS_PCC)
199 return MCE_PANIC_SEVERITY;
200
201 if (m->status & MCI_STATUS_UC) {
202
203 /*
204 * On older systems where overflow_recov flag is not present, we
205 * should simply panic if an error overflow occurs. If
206 * overflow_recov flag is present and set, then software can try
207 * to at least kill process to prolong system operation.
208 */
209 if (mce_flags.overflow_recov) {
210 /* software can try to contain */
211 if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
212 return MCE_PANIC_SEVERITY;
213
214 /* kill current process */
215 return MCE_AR_SEVERITY;
216 } else {
217 /* at least one error was not logged */
218 if (m->status & MCI_STATUS_OVER)
219 return MCE_PANIC_SEVERITY;
220 }
221
222 /*
223 * For any other case, return MCE_UC_SEVERITY so that we log the
224 * error and exit #MC handler.
225 */
226 return MCE_UC_SEVERITY;
227 }
228
229 /*
230 * deferred error: poll handler catches these and adds to mce_ring so
231 * memory-failure can take recovery actions.
232 */
233 if (m->status & MCI_STATUS_DEFERRED)
234 return MCE_DEFERRED_SEVERITY;
235
236 /*
237 * corrected error: poll handler catches these and passes responsibility
238 * of decoding the error to EDAC
239 */
240 return MCE_KEEP_SEVERITY;
241}
242
243static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
190{ 244{
191 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); 245 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
192 enum context ctx = error_context(m); 246 enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
216 } 270 }
217} 271}
218 272
273/* Default to mce_severity_intel */
274int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
275 mce_severity_intel;
276
277void __init mcheck_vendor_init_severity(void)
278{
279 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
280 mce_severity = mce_severity_amd;
281}
282
219#ifdef CONFIG_DEBUG_FS 283#ifdef CONFIG_DEBUG_FS
220static void *s_start(struct seq_file *f, loff_t *pos) 284static void *s_start(struct seq_file *f, loff_t *pos)
221{ 285{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb4a370..e535533d5ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
60#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
61#include <trace/events/mce.h> 61#include <trace/events/mce.h>
62 62
63#define SPINUNIT 100 /* 100ns */ 63#define SPINUNIT 100 /* 100ns */
64 64
65DEFINE_PER_CPU(unsigned, mce_exception_count); 65DEFINE_PER_CPU(unsigned, mce_exception_count);
66 66
67struct mce_bank *mce_banks __read_mostly; 67struct mce_bank *mce_banks __read_mostly;
68struct mce_vendor_flags mce_flags __read_mostly;
68 69
69struct mca_config mca_cfg __read_mostly = { 70struct mca_config mca_cfg __read_mostly = {
70 .bootlog = -1, 71 .bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
89static DEFINE_PER_CPU(struct mce, mces_seen); 90static DEFINE_PER_CPU(struct mce, mces_seen);
90static int cpu_missing; 91static int cpu_missing;
91 92
92/* CMCI storm detection filter */
93static DEFINE_PER_CPU(unsigned long, mce_polled_error);
94
95/* 93/*
96 * MCA banks polled by the period polling timer for corrected events. 94 * MCA banks polled by the period polling timer for corrected events.
97 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). 95 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
622 * is already totally * confused. In this case it's likely it will 620 * is already totally * confused. In this case it's likely it will
623 * not fully execute the machine check handler either. 621 * not fully execute the machine check handler either.
624 */ 622 */
625void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 623bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
626{ 624{
625 bool error_logged = false;
627 struct mce m; 626 struct mce m;
628 int severity; 627 int severity;
629 int i; 628 int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
646 if (!(m.status & MCI_STATUS_VAL)) 645 if (!(m.status & MCI_STATUS_VAL))
647 continue; 646 continue;
648 647
649 this_cpu_write(mce_polled_error, 1); 648
650 /* 649 /*
651 * Uncorrected or signalled events are handled by the exception 650 * Uncorrected or signalled events are handled by the exception
652 * handler when it is enabled, so don't process those here. 651 * handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
679 * Don't get the IP here because it's unlikely to 678 * Don't get the IP here because it's unlikely to
680 * have anything to do with the actual error location. 679 * have anything to do with the actual error location.
681 */ 680 */
682 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) 681 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
682 error_logged = true;
683 mce_log(&m); 683 mce_log(&m);
684 }
684 685
685 /* 686 /*
686 * Clear state for this bank. 687 * Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
694 */ 695 */
695 696
696 sync_core(); 697 sync_core();
698
699 return error_logged;
697} 700}
698EXPORT_SYMBOL_GPL(machine_check_poll); 701EXPORT_SYMBOL_GPL(machine_check_poll);
699 702
@@ -813,7 +816,7 @@ static void mce_reign(void)
813 * other CPUs. 816 * other CPUs.
814 */ 817 */
815 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) 818 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
816 mce_panic("Fatal Machine check", m, msg); 819 mce_panic("Fatal machine check", m, msg);
817 820
818 /* 821 /*
819 * For UC somewhere we let the CPU who detects it handle it. 822 * For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
826 * source or one CPU is hung. Panic. 829 * source or one CPU is hung. Panic.
827 */ 830 */
828 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) 831 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
829 mce_panic("Machine check from unknown source", NULL, NULL); 832 mce_panic("Fatal machine check from unknown source", NULL, NULL);
830 833
831 /* 834 /*
832 * Now clear all the mces_seen so that they don't reappear on 835 * Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
1258 * poller finds an MCE, poll 2x faster. When the poller finds no more 1261 * poller finds an MCE, poll 2x faster. When the poller finds no more
1259 * errors, poll 2x slower (up to check_interval seconds). 1262 * errors, poll 2x slower (up to check_interval seconds).
1260 */ 1263 */
1261static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1264static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1262 1265
1263static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1266static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1264static DEFINE_PER_CPU(struct timer_list, mce_timer); 1267static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
1268 return interval; 1271 return interval;
1269} 1272}
1270 1273
1271static unsigned long (*mce_adjust_timer)(unsigned long interval) = 1274static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1272 mce_adjust_timer_default;
1273 1275
1274static int cmc_error_seen(void) 1276static void __restart_timer(struct timer_list *t, unsigned long interval)
1275{ 1277{
1276 unsigned long *v = this_cpu_ptr(&mce_polled_error); 1278 unsigned long when = jiffies + interval;
1279 unsigned long flags;
1280
1281 local_irq_save(flags);
1277 1282
1278 return test_and_clear_bit(0, v); 1283 if (timer_pending(t)) {
1284 if (time_before(when, t->expires))
1285 mod_timer_pinned(t, when);
1286 } else {
1287 t->expires = round_jiffies(when);
1288 add_timer_on(t, smp_processor_id());
1289 }
1290
1291 local_irq_restore(flags);
1279} 1292}
1280 1293
1281static void mce_timer_fn(unsigned long data) 1294static void mce_timer_fn(unsigned long data)
1282{ 1295{
1283 struct timer_list *t = this_cpu_ptr(&mce_timer); 1296 struct timer_list *t = this_cpu_ptr(&mce_timer);
1297 int cpu = smp_processor_id();
1284 unsigned long iv; 1298 unsigned long iv;
1285 int notify;
1286 1299
1287 WARN_ON(smp_processor_id() != data); 1300 WARN_ON(cpu != data);
1301
1302 iv = __this_cpu_read(mce_next_interval);
1288 1303
1289 if (mce_available(this_cpu_ptr(&cpu_info))) { 1304 if (mce_available(this_cpu_ptr(&cpu_info))) {
1290 machine_check_poll(MCP_TIMESTAMP, 1305 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1291 this_cpu_ptr(&mce_poll_banks)); 1306
1292 mce_intel_cmci_poll(); 1307 if (mce_intel_cmci_poll()) {
1308 iv = mce_adjust_timer(iv);
1309 goto done;
1310 }
1293 } 1311 }
1294 1312
1295 /* 1313 /*
1296 * Alert userspace if needed. If we logged an MCE, reduce the 1314 * Alert userspace if needed. If we logged an MCE, reduce the polling
1297 * polling interval, otherwise increase the polling interval. 1315 * interval, otherwise increase the polling interval.
1298 */ 1316 */
1299 iv = __this_cpu_read(mce_next_interval); 1317 if (mce_notify_irq())
1300 notify = mce_notify_irq();
1301 notify |= cmc_error_seen();
1302 if (notify) {
1303 iv = max(iv / 2, (unsigned long) HZ/100); 1318 iv = max(iv / 2, (unsigned long) HZ/100);
1304 } else { 1319 else
1305 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1320 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1306 iv = mce_adjust_timer(iv); 1321
1307 } 1322done:
1308 __this_cpu_write(mce_next_interval, iv); 1323 __this_cpu_write(mce_next_interval, iv);
1309 /* Might have become 0 after CMCI storm subsided */ 1324 __restart_timer(t, iv);
1310 if (iv) {
1311 t->expires = jiffies + iv;
1312 add_timer_on(t, smp_processor_id());
1313 }
1314} 1325}
1315 1326
1316/* 1327/*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
1319void mce_timer_kick(unsigned long interval) 1330void mce_timer_kick(unsigned long interval)
1320{ 1331{
1321 struct timer_list *t = this_cpu_ptr(&mce_timer); 1332 struct timer_list *t = this_cpu_ptr(&mce_timer);
1322 unsigned long when = jiffies + interval;
1323 unsigned long iv = __this_cpu_read(mce_next_interval); 1333 unsigned long iv = __this_cpu_read(mce_next_interval);
1324 1334
1325 if (timer_pending(t)) { 1335 __restart_timer(t, interval);
1326 if (time_before(when, t->expires)) 1336
1327 mod_timer_pinned(t, when);
1328 } else {
1329 t->expires = round_jiffies(when);
1330 add_timer_on(t, smp_processor_id());
1331 }
1332 if (interval < iv) 1337 if (interval < iv)
1333 __this_cpu_write(mce_next_interval, interval); 1338 __this_cpu_write(mce_next_interval, interval);
1334} 1339}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1525 * Various K7s with broken bank 0 around. Always disable 1530 * Various K7s with broken bank 0 around. Always disable
1526 * by default. 1531 * by default.
1527 */ 1532 */
1528 if (c->x86 == 6 && cfg->banks > 0) 1533 if (c->x86 == 6 && cfg->banks > 0)
1529 mce_banks[0].ctl = 0; 1534 mce_banks[0].ctl = 0;
1530 1535
1531 /* 1536 /*
1532 * Turn off MC4_MISC thresholding banks on those models since 1537 * overflow_recov is supported for F15h Models 00h-0fh
1533 * they're not supported there. 1538 * even though we don't have a CPUID bit for it.
1534 */ 1539 */
1535 if (c->x86 == 0x15 && 1540 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1536 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { 1541 mce_flags.overflow_recov = 1;
1537 int i; 1542
1538 u64 val, hwcr; 1543 /*
1539 bool need_toggle; 1544 * Turn off MC4_MISC thresholding banks on those models since
1540 u32 msrs[] = { 1545 * they're not supported there.
1546 */
1547 if (c->x86 == 0x15 &&
1548 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1549 int i;
1550 u64 hwcr;
1551 bool need_toggle;
1552 u32 msrs[] = {
1541 0x00000413, /* MC4_MISC0 */ 1553 0x00000413, /* MC4_MISC0 */
1542 0xc0000408, /* MC4_MISC1 */ 1554 0xc0000408, /* MC4_MISC1 */
1543 }; 1555 };
1544 1556
1545 rdmsrl(MSR_K7_HWCR, hwcr); 1557 rdmsrl(MSR_K7_HWCR, hwcr);
1546 1558
1547 /* McStatusWrEn has to be set */ 1559 /* McStatusWrEn has to be set */
1548 need_toggle = !(hwcr & BIT(18)); 1560 need_toggle = !(hwcr & BIT(18));
1549 1561
1550 if (need_toggle) 1562 if (need_toggle)
1551 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 1563 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1552 1564
1553 for (i = 0; i < ARRAY_SIZE(msrs); i++) { 1565 /* Clear CntP bit safely */
1554 rdmsrl(msrs[i], val); 1566 for (i = 0; i < ARRAY_SIZE(msrs); i++)
1567 msr_clear_bit(msrs[i], 62);
1555 1568
1556 /* CntP bit set? */ 1569 /* restore old settings */
1557 if (val & BIT_64(62)) { 1570 if (need_toggle)
1558 val &= ~BIT_64(62); 1571 wrmsrl(MSR_K7_HWCR, hwcr);
1559 wrmsrl(msrs[i], val); 1572 }
1560 }
1561 }
1562
1563 /* restore old settings */
1564 if (need_toggle)
1565 wrmsrl(MSR_K7_HWCR, hwcr);
1566 }
1567 } 1573 }
1568 1574
1569 if (c->x86_vendor == X86_VENDOR_INTEL) { 1575 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1629 switch (c->x86_vendor) { 1635 switch (c->x86_vendor) {
1630 case X86_VENDOR_INTEL: 1636 case X86_VENDOR_INTEL:
1631 mce_intel_feature_init(c); 1637 mce_intel_feature_init(c);
1632 mce_adjust_timer = mce_intel_adjust_timer; 1638 mce_adjust_timer = cmci_intel_adjust_timer;
1633 break; 1639 break;
1634 case X86_VENDOR_AMD: 1640 case X86_VENDOR_AMD:
1635 mce_amd_feature_init(c); 1641 mce_amd_feature_init(c);
1642 mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
1636 break; 1643 break;
1637 default: 1644 default:
1638 break; 1645 break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
2017int __init mcheck_init(void) 2024int __init mcheck_init(void)
2018{ 2025{
2019 mcheck_intel_therm_init(); 2026 mcheck_intel_therm_init();
2027 mcheck_vendor_init_severity();
2020 2028
2021 return 0; 2029 return 0;
2022} 2030}
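
For reference, the polling logic that mce_timer_fn() ends up with is a simple multiplicative backoff: halve the interval whenever the poll had something to report (or the CMCI path asked for faster polling), otherwise double it up to check_interval, then let __restart_timer() re-arm the per-CPU timer. A minimal user-space sketch of that arithmetic, assuming HZ = 1000 and the 5-minute ceiling used here; next_interval() and main() are illustrative only:

#include <stdio.h>

#define HZ		1000UL
#define CHECK_INTERVAL	(5 * 60)		/* seconds, as in mce.c */
#define MAX_INTERVAL	(CHECK_INTERVAL * HZ)	/* "jiffies" */
#define MIN_INTERVAL	(HZ / 100)		/* 10ms floor, as in mce_timer_fn() */

/* 'logged' stands in for mce_notify_irq()'s return value. */
static unsigned long next_interval(unsigned long iv, int logged)
{
	if (logged)
		return iv / 2 > MIN_INTERVAL ? iv / 2 : MIN_INTERVAL;
	return iv * 2 < MAX_INTERVAL ? iv * 2 : MAX_INTERVAL;
}

int main(void)
{
	unsigned long iv = MAX_INTERVAL;
	int i;

	/* three noisy polls followed by quiet ones */
	for (i = 0; i < 8; i++) {
		iv = next_interval(iv, i < 3);
		printf("poll %d: next timer in %lu ms\n", i, iv * 1000 / HZ);
	}
	return 0;
}
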
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
79 return (bank == 4); 79 return (bank == 4);
80} 80}
81 81
82static const char * const bank4_names(struct threshold_block *b) 82static const char *bank4_names(const struct threshold_block *b)
83{ 83{
84 switch (b->address) { 84 switch (b->address) {
85 /* MSR4_MISC0 */ 85 /* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
250 if (!b.interrupt_capable) 250 if (!b.interrupt_capable)
251 goto init; 251 goto init;
252 252
253 b.interrupt_enable = 1;
253 new = (high & MASK_LVTOFF_HI) >> 20; 254 new = (high & MASK_LVTOFF_HI) >> 20;
254 offset = setup_APIC_mce(offset, new); 255 offset = setup_APIC_mce(offset, new);
255 256
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
322log: 323log:
323 mce_setup(&m); 324 mce_setup(&m);
324 rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); 325 rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
326 if (!(m.status & MCI_STATUS_VAL))
327 return;
325 m.misc = ((u64)high << 32) | low; 328 m.misc = ((u64)high << 32) | low;
326 m.bank = bank; 329 m.bank = bank;
327 mce_log(&m); 330 mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
497 b->interrupt_capable = lvt_interrupt_supported(bank, high); 500 b->interrupt_capable = lvt_interrupt_supported(bank, high);
498 b->threshold_limit = THRESHOLD_MAX; 501 b->threshold_limit = THRESHOLD_MAX;
499 502
500 if (b->interrupt_capable) 503 if (b->interrupt_capable) {
501 threshold_ktype.default_attrs[2] = &interrupt_enable.attr; 504 threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
502 else 505 b->interrupt_enable = 1;
506 } else {
503 threshold_ktype.default_attrs[2] = NULL; 507 threshold_ktype.default_attrs[2] = NULL;
508 }
504 509
505 INIT_LIST_HEAD(&b->miscj); 510 INIT_LIST_HEAD(&b->miscj);
506 511
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); 39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
40 40
41/* 41/*
42 * CMCI storm detection backoff counter
43 *
44 * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
45 * encountered an error. If not, we decrement it by one. We signal the end of
46 * the CMCI storm when it reaches 0.
47 */
48static DEFINE_PER_CPU(int, cmci_backoff_cnt);
49
50/*
42 * cmci_discover_lock protects against parallel discovery attempts 51 * cmci_discover_lock protects against parallel discovery attempts
43 * which could race against each other. 52 * which could race against each other.
44 */ 53 */
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
46 55
47#define CMCI_THRESHOLD 1 56#define CMCI_THRESHOLD 1
48#define CMCI_POLL_INTERVAL (30 * HZ) 57#define CMCI_POLL_INTERVAL (30 * HZ)
49#define CMCI_STORM_INTERVAL (1 * HZ) 58#define CMCI_STORM_INTERVAL (HZ)
50#define CMCI_STORM_THRESHOLD 15 59#define CMCI_STORM_THRESHOLD 15
51 60
52static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); 61static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
82 return !!(cap & MCG_CMCI_P); 91 return !!(cap & MCG_CMCI_P);
83} 92}
84 93
85void mce_intel_cmci_poll(void) 94bool mce_intel_cmci_poll(void)
86{ 95{
87 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) 96 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
88 return; 97 return false;
89 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 98
99 /*
100 * Reset the counter if we've logged an error in the last poll
101 * during the storm.
102 */
103 if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
104 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
105 else
106 this_cpu_dec(cmci_backoff_cnt);
107
108 return true;
90} 109}
91 110
92void mce_intel_hcpu_update(unsigned long cpu) 111void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
97 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; 116 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
98} 117}
99 118
100unsigned long mce_intel_adjust_timer(unsigned long interval) 119unsigned long cmci_intel_adjust_timer(unsigned long interval)
101{ 120{
102 int r; 121 if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
103 122 (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
104 if (interval < CMCI_POLL_INTERVAL) 123 mce_notify_irq();
105 return interval; 124 return CMCI_STORM_INTERVAL;
125 }
106 126
107 switch (__this_cpu_read(cmci_storm_state)) { 127 switch (__this_cpu_read(cmci_storm_state)) {
108 case CMCI_STORM_ACTIVE: 128 case CMCI_STORM_ACTIVE:
129
109 /* 130 /*
110 * We switch back to interrupt mode once the poll timer has 131 * We switch back to interrupt mode once the poll timer has
111 * silenced itself. That means no events recorded and the 132 * silenced itself. That means no events recorded and the timer
112 * timer interval is back to our poll interval. 133 * interval is back to our poll interval.
113 */ 134 */
114 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); 135 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
115 r = atomic_sub_return(1, &cmci_storm_on_cpus); 136 if (!atomic_sub_return(1, &cmci_storm_on_cpus))
116 if (r == 0)
117 pr_notice("CMCI storm subsided: switching to interrupt mode\n"); 137 pr_notice("CMCI storm subsided: switching to interrupt mode\n");
138
118 /* FALLTHROUGH */ 139 /* FALLTHROUGH */
119 140
120 case CMCI_STORM_SUBSIDED: 141 case CMCI_STORM_SUBSIDED:
121 /* 142 /*
122 * We wait for all cpus to go back to SUBSIDED 143 * We wait for all CPUs to go back to SUBSIDED state. When that
123 * state. When that happens we switch back to 144 * happens we switch back to interrupt mode.
124 * interrupt mode.
125 */ 145 */
126 if (!atomic_read(&cmci_storm_on_cpus)) { 146 if (!atomic_read(&cmci_storm_on_cpus)) {
127 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); 147 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
130 } 150 }
131 return CMCI_POLL_INTERVAL; 151 return CMCI_POLL_INTERVAL;
132 default: 152 default:
133 /* 153
134 * We have shiny weather. Let the poll do whatever it 154 /* We have shiny weather. Let the poll do whatever it thinks. */
135 * thinks.
136 */
137 return interval; 155 return interval;
138 } 156 }
139} 157}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
178 cmci_storm_disable_banks(); 196 cmci_storm_disable_banks();
179 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); 197 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
180 r = atomic_add_return(1, &cmci_storm_on_cpus); 198 r = atomic_add_return(1, &cmci_storm_on_cpus);
181 mce_timer_kick(CMCI_POLL_INTERVAL); 199 mce_timer_kick(CMCI_STORM_INTERVAL);
200 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
182 201
183 if (r == 1) 202 if (r == 1)
184 pr_notice("CMCI storm detected: switching to poll mode\n"); 203 pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
195{ 214{
196 if (cmci_storm_detect()) 215 if (cmci_storm_detect())
197 return; 216 return;
217
198 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 218 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
199 mce_notify_irq(); 219 mce_notify_irq();
200} 220}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
286 306
287 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) 307 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
288 return; 308 return;
309
289 local_irq_save(flags); 310 local_irq_save(flags);
290 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 311 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
291 local_irq_restore(flags); 312 local_irq_restore(flags);
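
The cmci_backoff_cnt handling above can be read as a small per-CPU state machine: while a storm is active, every poll that logs an error re-arms the counter to INITIAL_CHECK_INTERVAL and every quiet poll decrements it, and cmci_intel_adjust_timer() only keeps the fast CMCI_STORM_INTERVAL while the counter is still positive. A standalone sketch of that behaviour; the struct, the counter seed and the function names are placeholders, not the kernel's types:

#include <stdbool.h>
#include <stdio.h>

#define BACKOFF_SEED	16	/* stands in for INITIAL_CHECK_INTERVAL */

struct cmci_cpu {
	int	backoff_cnt;
	bool	storm_active;
};

/* Mirrors mce_intel_cmci_poll(): 'logged' is machine_check_poll()'s result. */
static void cmci_poll(struct cmci_cpu *c, bool logged)
{
	if (!c->storm_active)
		return;

	if (logged)
		c->backoff_cnt = BACKOFF_SEED;
	else
		c->backoff_cnt--;
}

/* Mirrors the first check in cmci_intel_adjust_timer(). */
static bool keep_fast_polling(const struct cmci_cpu *c)
{
	return c->storm_active && c->backoff_cnt > 0;
}

int main(void)
{
	struct cmci_cpu c = { .backoff_cnt = BACKOFF_SEED, .storm_active = true };
	int quiet = 0;

	while (keep_fast_polling(&c)) {	/* no further errors arrive */
		cmci_poll(&c, false);
		quiet++;
	}
	printf("fast polling stops after %d quiet polls\n", quiet);
	return 0;
}
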
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 22
23#include <linux/firmware.h> 23#include <linux/firmware.h>
24#include <linux/pci_ids.h>
25#include <linux/uaccess.h> 24#include <linux/uaccess.h>
26#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
23#include <asm/processor.h> 23#include <asm/processor.h>
24#include <asm/cmdline.h> 24#include <asm/cmdline.h>
25 25
26#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
27#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
28#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
29#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
30#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
31#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
32#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
33
34#define CPUID_IS(a, b, c, ebx, ecx, edx) \
35 (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
36
37/*
38 * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
39 * x86_vendor() gets vendor id for BSP.
40 *
41 * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
42 * coding, we still use x86_vendor() to get vendor id for AP.
43 *
44 * x86_vendor() gets vendor information directly through cpuid.
45 */
46static int x86_vendor(void)
47{
48 u32 eax = 0x00000000;
49 u32 ebx, ecx = 0, edx;
50
51 native_cpuid(&eax, &ebx, &ecx, &edx);
52
53 if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
54 return X86_VENDOR_INTEL;
55
56 if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
57 return X86_VENDOR_AMD;
58
59 return X86_VENDOR_UNKNOWN;
60}
61
62static int x86_family(void)
63{
64 u32 eax = 0x00000001;
65 u32 ebx, ecx = 0, edx;
66 int x86;
67
68 native_cpuid(&eax, &ebx, &ecx, &edx);
69
70 x86 = (eax >> 8) & 0xf;
71 if (x86 == 15)
72 x86 += (eax >> 20) & 0xff;
73
74 return x86;
75}
76
77static bool __init check_loader_disabled_bsp(void) 26static bool __init check_loader_disabled_bsp(void)
78{ 27{
79#ifdef CONFIG_X86_32 28#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
96 45
97void __init load_ucode_bsp(void) 46void __init load_ucode_bsp(void)
98{ 47{
99 int vendor, x86; 48 int vendor, family;
100 49
101 if (check_loader_disabled_bsp()) 50 if (check_loader_disabled_bsp())
102 return; 51 return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
105 return; 54 return;
106 55
107 vendor = x86_vendor(); 56 vendor = x86_vendor();
108 x86 = x86_family(); 57 family = x86_family();
109 58
110 switch (vendor) { 59 switch (vendor) {
111 case X86_VENDOR_INTEL: 60 case X86_VENDOR_INTEL:
112 if (x86 >= 6) 61 if (family >= 6)
113 load_ucode_intel_bsp(); 62 load_ucode_intel_bsp();
114 break; 63 break;
115 case X86_VENDOR_AMD: 64 case X86_VENDOR_AMD:
116 if (x86 >= 0x10) 65 if (family >= 0x10)
117 load_ucode_amd_bsp(); 66 load_ucode_amd_bsp();
118 break; 67 break;
119 default: 68 default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
132 81
133void load_ucode_ap(void) 82void load_ucode_ap(void)
134{ 83{
135 int vendor, x86; 84 int vendor, family;
136 85
137 if (check_loader_disabled_ap()) 86 if (check_loader_disabled_ap())
138 return; 87 return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
141 return; 90 return;
142 91
143 vendor = x86_vendor(); 92 vendor = x86_vendor();
144 x86 = x86_family(); 93 family = x86_family();
145 94
146 switch (vendor) { 95 switch (vendor) {
147 case X86_VENDOR_INTEL: 96 case X86_VENDOR_INTEL:
148 if (x86 >= 6) 97 if (family >= 6)
149 load_ucode_intel_ap(); 98 load_ucode_intel_ap();
150 break; 99 break;
151 case X86_VENDOR_AMD: 100 case X86_VENDOR_AMD:
152 if (x86 >= 0x10) 101 if (family >= 0x10)
153 load_ucode_amd_ap(); 102 load_ucode_amd_ap();
154 break; 103 break;
155 default: 104 default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
179 128
180void reload_early_microcode(void) 129void reload_early_microcode(void)
181{ 130{
182 int vendor, x86; 131 int vendor, family;
183 132
184 vendor = x86_vendor(); 133 vendor = x86_vendor();
185 x86 = x86_family(); 134 family = x86_family();
186 135
187 switch (vendor) { 136 switch (vendor) {
188 case X86_VENDOR_INTEL: 137 case X86_VENDOR_INTEL:
189 if (x86 >= 6) 138 if (family >= 6)
190 reload_ucode_intel(); 139 reload_ucode_intel();
191 break; 140 break;
192 case X86_VENDOR_AMD: 141 case X86_VENDOR_AMD:
193 if (x86 >= 0x10) 142 if (family >= 0x10)
194 reload_ucode_amd(); 143 reload_ucode_amd();
195 break; 144 break;
196 default: 145 default:
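
The helpers dropped from this file (x86_vendor(), x86_family()) and the get_x86_family()/get_x86_model() pair removed from intel_early.c below all decode the CPUID signature the same way, which is presumably what the shared __x86_family()/x86_model() helpers used elsewhere in this series now provide. A self-contained sketch of that decoding, taking a raw signature word; the function names here are placeholders:

#include <stdio.h>

/* Family: bits 11:8, plus the extended family (bits 27:20) when family == 0xf. */
static unsigned int sig_family(unsigned int sig)
{
	unsigned int fam = (sig >> 8) & 0xf;

	if (fam == 0xf)
		fam += (sig >> 20) & 0xff;
	return fam;
}

/* Model: bits 7:4, plus the extended model (bits 19:16) for families 6 and 0xf. */
static unsigned int sig_model(unsigned int sig)
{
	unsigned int fam = sig_family(sig);
	unsigned int model = (sig >> 4) & 0xf;

	if (fam == 0x6 || fam == 0xf)
		model += ((sig >> 16) & 0xf) << 4;
	return model;
}

int main(void)
{
	unsigned int sig = 0x000306c3;	/* example signature: family 6, model 0x3c */

	printf("family 0x%x, model 0x%x\n", sig_family(sig), sig_model(sig));
	return 0;
}
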
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
124 cpf = cpu_sig.pf; 124 cpf = cpu_sig.pf;
125 crev = cpu_sig.rev; 125 crev = cpu_sig.rev;
126 126
127 return get_matching_microcode(csig, cpf, mc_intel, crev); 127 return get_matching_microcode(csig, cpf, crev, mc_intel);
128} 128}
129 129
130static int apply_microcode_intel(int cpu) 130static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
226 226
227 csig = uci->cpu_sig.sig; 227 csig = uci->cpu_sig.sig;
228 cpf = uci->cpu_sig.pf; 228 cpf = uci->cpu_sig.pf;
229 if (get_matching_microcode(csig, cpf, mc, new_rev)) { 229 if (get_matching_microcode(csig, cpf, new_rev, mc)) {
230 vfree(new_mc); 230 vfree(new_mc);
231 new_rev = mc_header.rev; 231 new_rev = mc_header.rev;
232 new_mc = mc; 232 new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
16 * as published by the Free Software Foundation; either version 16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version. 17 * 2 of the License, or (at your option) any later version.
18 */ 18 */
19
20/*
21 * This needs to be before all headers so that pr_debug in printk.h doesn't turn
22 * printk calls into no_printk().
23 *
24 *#define DEBUG
25 */
26
19#include <linux/module.h> 27#include <linux/module.h>
20#include <linux/mm.h> 28#include <linux/mm.h>
21#include <linux/slab.h> 29#include <linux/slab.h>
@@ -28,6 +36,9 @@
28#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
29#include <asm/setup.h> 37#include <asm/setup.h>
30 38
39#undef pr_fmt
40#define pr_fmt(fmt) "microcode: " fmt
41
31static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; 42static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
32static struct mc_saved_data { 43static struct mc_saved_data {
33 unsigned int mc_saved_count; 44 unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
35} mc_saved_data; 46} mc_saved_data;
36 47
37static enum ucode_state 48static enum ucode_state
38generic_load_microcode_early(struct microcode_intel **mc_saved_p, 49load_microcode_early(struct microcode_intel **saved,
39 unsigned int mc_saved_count, 50 unsigned int num_saved, struct ucode_cpu_info *uci)
40 struct ucode_cpu_info *uci)
41{ 51{
42 struct microcode_intel *ucode_ptr, *new_mc = NULL; 52 struct microcode_intel *ucode_ptr, *new_mc = NULL;
43 int new_rev = uci->cpu_sig.rev; 53 struct microcode_header_intel *mc_hdr;
44 enum ucode_state state = UCODE_OK; 54 int new_rev, ret, i;
45 unsigned int mc_size;
46 struct microcode_header_intel *mc_header;
47 unsigned int csig = uci->cpu_sig.sig;
48 unsigned int cpf = uci->cpu_sig.pf;
49 int i;
50 55
51 for (i = 0; i < mc_saved_count; i++) { 56 new_rev = uci->cpu_sig.rev;
52 ucode_ptr = mc_saved_p[i];
53 57
54 mc_header = (struct microcode_header_intel *)ucode_ptr; 58 for (i = 0; i < num_saved; i++) {
55 mc_size = get_totalsize(mc_header); 59 ucode_ptr = saved[i];
56 if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { 60 mc_hdr = (struct microcode_header_intel *)ucode_ptr;
57 new_rev = mc_header->rev;
58 new_mc = ucode_ptr;
59 }
60 }
61 61
62 if (!new_mc) { 62 ret = get_matching_microcode(uci->cpu_sig.sig,
63 state = UCODE_NFOUND; 63 uci->cpu_sig.pf,
64 goto out; 64 new_rev,
65 ucode_ptr);
66 if (!ret)
67 continue;
68
69 new_rev = mc_hdr->rev;
70 new_mc = ucode_ptr;
65 } 71 }
66 72
73 if (!new_mc)
74 return UCODE_NFOUND;
75
67 uci->mc = (struct microcode_intel *)new_mc; 76 uci->mc = (struct microcode_intel *)new_mc;
68out: 77 return UCODE_OK;
69 return state;
70} 78}
71 79
72static void 80static inline void
73microcode_pointer(struct microcode_intel **mc_saved, 81copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
74 unsigned long *mc_saved_in_initrd, 82 unsigned long off, int num_saved)
75 unsigned long initrd_start, int mc_saved_count)
76{ 83{
77 int i; 84 int i;
78 85
79 for (i = 0; i < mc_saved_count; i++) 86 for (i = 0; i < num_saved; i++)
80 mc_saved[i] = (struct microcode_intel *) 87 mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
81 (mc_saved_in_initrd[i] + initrd_start);
82} 88}
83 89
84#ifdef CONFIG_X86_32 90#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
102#endif 108#endif
103 109
104static enum ucode_state 110static enum ucode_state
105load_microcode(struct mc_saved_data *mc_saved_data, 111load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
106 unsigned long *mc_saved_in_initrd, 112 unsigned long initrd_start, struct ucode_cpu_info *uci)
107 unsigned long initrd_start,
108 struct ucode_cpu_info *uci)
109{ 113{
110 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; 114 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
111 unsigned int count = mc_saved_data->mc_saved_count; 115 unsigned int count = mc_saved_data->mc_saved_count;
112 116
113 if (!mc_saved_data->mc_saved) { 117 if (!mc_saved_data->mc_saved) {
114 microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, 118 copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
115 initrd_start, count);
116 119
117 return generic_load_microcode_early(mc_saved_tmp, count, uci); 120 return load_microcode_early(mc_saved_tmp, count, uci);
118 } else { 121 } else {
119#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
120 microcode_phys(mc_saved_tmp, mc_saved_data); 123 microcode_phys(mc_saved_tmp, mc_saved_data);
121 return generic_load_microcode_early(mc_saved_tmp, count, uci); 124 return load_microcode_early(mc_saved_tmp, count, uci);
122#else 125#else
123 return generic_load_microcode_early(mc_saved_data->mc_saved, 126 return load_microcode_early(mc_saved_data->mc_saved,
124 count, uci); 127 count, uci);
125#endif 128#endif
126 } 129 }
127} 130}
128 131
129static u8 get_x86_family(unsigned long sig)
130{
131 u8 x86;
132
133 x86 = (sig >> 8) & 0xf;
134
135 if (x86 == 0xf)
136 x86 += (sig >> 20) & 0xff;
137
138 return x86;
139}
140
141static u8 get_x86_model(unsigned long sig)
142{
143 u8 x86, x86_model;
144
145 x86 = get_x86_family(sig);
146 x86_model = (sig >> 4) & 0xf;
147
148 if (x86 == 0x6 || x86 == 0xf)
149 x86_model += ((sig >> 16) & 0xf) << 4;
150
151 return x86_model;
152}
153
154/* 132/*
155 * Given CPU signature and a microcode patch, this function finds if the 133 * Given CPU signature and a microcode patch, this function finds if the
156 * microcode patch has matching family and model with the CPU. 134 * microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
159matching_model_microcode(struct microcode_header_intel *mc_header, 137matching_model_microcode(struct microcode_header_intel *mc_header,
160 unsigned long sig) 138 unsigned long sig)
161{ 139{
162 u8 x86, x86_model; 140 unsigned int fam, model;
163 u8 x86_ucode, x86_model_ucode; 141 unsigned int fam_ucode, model_ucode;
164 struct extended_sigtable *ext_header; 142 struct extended_sigtable *ext_header;
165 unsigned long total_size = get_totalsize(mc_header); 143 unsigned long total_size = get_totalsize(mc_header);
166 unsigned long data_size = get_datasize(mc_header); 144 unsigned long data_size = get_datasize(mc_header);
167 int ext_sigcount, i; 145 int ext_sigcount, i;
168 struct extended_signature *ext_sig; 146 struct extended_signature *ext_sig;
169 147
170 x86 = get_x86_family(sig); 148 fam = __x86_family(sig);
171 x86_model = get_x86_model(sig); 149 model = x86_model(sig);
172 150
173 x86_ucode = get_x86_family(mc_header->sig); 151 fam_ucode = __x86_family(mc_header->sig);
174 x86_model_ucode = get_x86_model(mc_header->sig); 152 model_ucode = x86_model(mc_header->sig);
175 153
176 if (x86 == x86_ucode && x86_model == x86_model_ucode) 154 if (fam == fam_ucode && model == model_ucode)
177 return UCODE_OK; 155 return UCODE_OK;
178 156
179 /* Look for ext. headers: */ 157 /* Look for ext. headers: */
180 if (total_size <= data_size + MC_HEADER_SIZE) 158 if (total_size <= data_size + MC_HEADER_SIZE)
181 return UCODE_NFOUND; 159 return UCODE_NFOUND;
182 160
183 ext_header = (struct extended_sigtable *) 161 ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
184 mc_header + data_size + MC_HEADER_SIZE; 162 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
185 ext_sigcount = ext_header->count; 163 ext_sigcount = ext_header->count;
186 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
187 164
188 for (i = 0; i < ext_sigcount; i++) { 165 for (i = 0; i < ext_sigcount; i++) {
189 x86_ucode = get_x86_family(ext_sig->sig); 166 fam_ucode = __x86_family(ext_sig->sig);
190 x86_model_ucode = get_x86_model(ext_sig->sig); 167 model_ucode = x86_model(ext_sig->sig);
191 168
192 if (x86 == x86_ucode && x86_model == x86_model_ucode) 169 if (fam == fam_ucode && model == model_ucode)
193 return UCODE_OK; 170 return UCODE_OK;
194 171
195 ext_sig++; 172 ext_sig++;
196 } 173 }
197
198 return UCODE_NFOUND; 174 return UCODE_NFOUND;
199} 175}
200 176
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
204 unsigned int mc_saved_count) 180 unsigned int mc_saved_count)
205{ 181{
206 int i, j; 182 int i, j;
207 struct microcode_intel **mc_saved_p; 183 struct microcode_intel **saved_ptr;
208 int ret; 184 int ret;
209 185
210 if (!mc_saved_count) 186 if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
213 /* 189 /*
214 * Copy new microcode data. 190 * Copy new microcode data.
215 */ 191 */
216 mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), 192 saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
217 GFP_KERNEL); 193 if (!saved_ptr)
218 if (!mc_saved_p)
219 return -ENOMEM; 194 return -ENOMEM;
220 195
221 for (i = 0; i < mc_saved_count; i++) { 196 for (i = 0; i < mc_saved_count; i++) {
222 struct microcode_intel *mc = mc_saved_src[i]; 197 struct microcode_header_intel *mc_hdr;
223 struct microcode_header_intel *mc_header = &mc->hdr; 198 struct microcode_intel *mc;
224 unsigned long mc_size = get_totalsize(mc_header); 199 unsigned long size;
225 mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); 200
226 if (!mc_saved_p[i]) {
227 ret = -ENOMEM;
228 goto err;
229 }
230 if (!mc_saved_src[i]) { 201 if (!mc_saved_src[i]) {
231 ret = -EINVAL; 202 ret = -EINVAL;
232 goto err; 203 goto err;
233 } 204 }
234 memcpy(mc_saved_p[i], mc, mc_size); 205
206 mc = mc_saved_src[i];
207 mc_hdr = &mc->hdr;
208 size = get_totalsize(mc_hdr);
209
210 saved_ptr[i] = kmalloc(size, GFP_KERNEL);
211 if (!saved_ptr[i]) {
212 ret = -ENOMEM;
213 goto err;
214 }
215
216 memcpy(saved_ptr[i], mc, size);
235 } 217 }
236 218
237 /* 219 /*
238 * Point to newly saved microcode. 220 * Point to newly saved microcode.
239 */ 221 */
240 mc_saved_data->mc_saved = mc_saved_p; 222 mc_saved_data->mc_saved = saved_ptr;
241 mc_saved_data->mc_saved_count = mc_saved_count; 223 mc_saved_data->mc_saved_count = mc_saved_count;
242 224
243 return 0; 225 return 0;
244 226
245err: 227err:
246 for (j = 0; j <= i; j++) 228 for (j = 0; j <= i; j++)
247 kfree(mc_saved_p[j]); 229 kfree(saved_ptr[j]);
248 kfree(mc_saved_p); 230 kfree(saved_ptr);
249 231
250 return ret; 232 return ret;
251} 233}
@@ -257,48 +239,45 @@ err:
257 * - or if it is a newly discovered microcode patch. 239 * - or if it is a newly discovered microcode patch.
258 * 240 *
259 * The microcode patch should have matching model with CPU. 241 * The microcode patch should have matching model with CPU.
242 *
243 * Returns: The updated number @num_saved of saved microcode patches.
260 */ 244 */
261static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, 245static unsigned int _save_mc(struct microcode_intel **mc_saved,
262 unsigned int *mc_saved_count_p) 246 u8 *ucode_ptr, unsigned int num_saved)
263{ 247{
264 int i; 248 struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
265 int found = 0; 249 unsigned int sig, pf, new_rev;
266 unsigned int mc_saved_count = *mc_saved_count_p; 250 int found = 0, i;
267 struct microcode_header_intel *mc_header; 251
252 mc_hdr = (struct microcode_header_intel *)ucode_ptr;
253
254 for (i = 0; i < num_saved; i++) {
255 mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
256 sig = mc_saved_hdr->sig;
257 pf = mc_saved_hdr->pf;
258 new_rev = mc_hdr->rev;
259
260 if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
261 continue;
262
263 found = 1;
264
265 if (!revision_is_newer(mc_hdr, new_rev))
266 continue;
268 267
269 mc_header = (struct microcode_header_intel *)ucode_ptr;
270 for (i = 0; i < mc_saved_count; i++) {
271 unsigned int sig, pf;
272 unsigned int new_rev;
273 struct microcode_header_intel *mc_saved_header =
274 (struct microcode_header_intel *)mc_saved[i];
275 sig = mc_saved_header->sig;
276 pf = mc_saved_header->pf;
277 new_rev = mc_header->rev;
278
279 if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
280 found = 1;
281 if (update_match_revision(mc_header, new_rev)) {
282 /*
283 * Found an older ucode saved before.
284 * Replace the older one with this newer
285 * one.
286 */
287 mc_saved[i] =
288 (struct microcode_intel *)ucode_ptr;
289 break;
290 }
291 }
292 }
293 if (i >= mc_saved_count && !found)
294 /* 268 /*
295 * This ucode is first time discovered in ucode file. 269 * Found an older ucode saved earlier. Replace it with
296 * Save it to memory. 270 * this newer one.
297 */ 271 */
298 mc_saved[mc_saved_count++] = 272 mc_saved[i] = (struct microcode_intel *)ucode_ptr;
299 (struct microcode_intel *)ucode_ptr; 273 break;
274 }
275
276 /* Newly detected microcode, save it to memory. */
277 if (i >= num_saved && !found)
278 mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
300 279
301 *mc_saved_count_p = mc_saved_count; 280 return num_saved;
302} 281}
303 282
304/* 283/*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
346 continue; 325 continue;
347 } 326 }
348 327
349 _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); 328 mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
350 329
351 ucode_ptr += mc_size; 330 ucode_ptr += mc_size;
352 } 331 }
@@ -372,7 +351,7 @@ out:
372static int collect_cpu_info_early(struct ucode_cpu_info *uci) 351static int collect_cpu_info_early(struct ucode_cpu_info *uci)
373{ 352{
374 unsigned int val[2]; 353 unsigned int val[2];
375 u8 x86, x86_model; 354 unsigned int family, model;
376 struct cpu_signature csig; 355 struct cpu_signature csig;
377 unsigned int eax, ebx, ecx, edx; 356 unsigned int eax, ebx, ecx, edx;
378 357
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
387 native_cpuid(&eax, &ebx, &ecx, &edx); 366 native_cpuid(&eax, &ebx, &ecx, &edx);
388 csig.sig = eax; 367 csig.sig = eax;
389 368
390 x86 = get_x86_family(csig.sig); 369 family = __x86_family(csig.sig);
391 x86_model = get_x86_model(csig.sig); 370 model = x86_model(csig.sig);
392 371
393 if ((x86_model >= 5) || (x86 > 6)) { 372 if ((model >= 5) || (family > 6)) {
394 /* get processor flags from MSR 0x17 */ 373 /* get processor flags from MSR 0x17 */
395 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); 374 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
396 csig.pf = 1 << ((val[1] >> 18) & 7); 375 csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
429 sig = uci.cpu_sig.sig; 408 sig = uci.cpu_sig.sig;
430 pf = uci.cpu_sig.pf; 409 pf = uci.cpu_sig.pf;
431 rev = uci.cpu_sig.rev; 410 rev = uci.cpu_sig.rev;
432 pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", 411 pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
433 smp_processor_id(), sig, pf, rev);
434 412
435 for (i = 0; i < mc_saved_data.mc_saved_count; i++) { 413 for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
436 struct microcode_header_intel *mc_saved_header; 414 struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
457 if (total_size <= data_size + MC_HEADER_SIZE) 435 if (total_size <= data_size + MC_HEADER_SIZE)
458 continue; 436 continue;
459 437
460 ext_header = (struct extended_sigtable *) 438 ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
461 mc_saved_header + data_size + MC_HEADER_SIZE;
462 ext_sigcount = ext_header->count; 439 ext_sigcount = ext_header->count;
463 ext_sig = (void *)ext_header + EXT_HEADER_SIZE; 440 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
464 441
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
515 * Save the microcode patch mc in mc_save_tmp structure if it's a newer 492 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
516 * version. 493 * version.
517 */ 494 */
518 495 mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
519 _save_mc(mc_saved_tmp, mc, &mc_saved_count);
520 496
521 /* 497 /*
522 * Save the mc_save_tmp in global mc_saved_data. 498 * Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
548 524
549static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; 525static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
550static __init enum ucode_state 526static __init enum ucode_state
551scan_microcode(unsigned long start, unsigned long end, 527scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
552 struct mc_saved_data *mc_saved_data, 528 unsigned long start, unsigned long size,
553 unsigned long *mc_saved_in_initrd, 529 struct ucode_cpu_info *uci)
554 struct ucode_cpu_info *uci)
555{ 530{
556 unsigned int size = end - start + 1;
557 struct cpio_data cd; 531 struct cpio_data cd;
558 long offset = 0; 532 long offset = 0;
559#ifdef CONFIG_X86_32 533#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
569 if (!cd.data) 543 if (!cd.data)
570 return UCODE_ERROR; 544 return UCODE_ERROR;
571 545
572
573 return get_matching_model_microcode(0, start, cd.data, cd.size, 546 return get_matching_model_microcode(0, start, cd.data, cd.size,
574 mc_saved_data, mc_saved_in_initrd, 547 mc_saved_data, initrd, uci);
575 uci);
576} 548}
577 549
578/* 550/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
704 if (count == 0) 676 if (count == 0)
705 return ret; 677 return ret;
706 678
707 microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); 679 copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
708 ret = save_microcode(&mc_saved_data, mc_saved, count); 680 ret = save_microcode(&mc_saved_data, mc_saved, count);
709 if (ret) 681 if (ret)
710 pr_err("Cannot save microcode patches from initrd.\n"); 682 pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
716 688
717static void __init 689static void __init
718_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, 690_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
719 unsigned long *mc_saved_in_initrd, 691 unsigned long *initrd,
720 unsigned long initrd_start_early, 692 unsigned long start, unsigned long size)
721 unsigned long initrd_end_early,
722 struct ucode_cpu_info *uci)
723{ 693{
694 struct ucode_cpu_info uci;
724 enum ucode_state ret; 695 enum ucode_state ret;
725 696
726 collect_cpu_info_early(uci); 697 collect_cpu_info_early(&uci);
727 scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
728 mc_saved_in_initrd, uci);
729 698
730 ret = load_microcode(mc_saved_data, mc_saved_in_initrd, 699 ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
731 initrd_start_early, uci); 700 if (ret != UCODE_OK)
701 return;
732 702
733 if (ret == UCODE_OK) 703 ret = load_microcode(mc_saved_data, initrd, start, &uci);
734 apply_microcode_early(uci, true); 704 if (ret != UCODE_OK)
705 return;
706
707 apply_microcode_early(&uci, true);
735} 708}
736 709
737void __init 710void __init load_ucode_intel_bsp(void)
738load_ucode_intel_bsp(void)
739{ 711{
740 u64 ramdisk_image, ramdisk_size; 712 u64 start, size;
741 unsigned long initrd_start_early, initrd_end_early;
742 struct ucode_cpu_info uci;
743#ifdef CONFIG_X86_32 713#ifdef CONFIG_X86_32
744 struct boot_params *boot_params_p; 714 struct boot_params *p;
745 715
746 boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); 716 p = (struct boot_params *)__pa_nodebug(&boot_params);
747 ramdisk_image = boot_params_p->hdr.ramdisk_image; 717 start = p->hdr.ramdisk_image;
748 ramdisk_size = boot_params_p->hdr.ramdisk_size; 718 size = p->hdr.ramdisk_size;
749 initrd_start_early = ramdisk_image;
750 initrd_end_early = initrd_start_early + ramdisk_size;
751 719
752 _load_ucode_intel_bsp( 720 _load_ucode_intel_bsp(
753 (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), 721 (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
754 (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), 722 (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
755 initrd_start_early, initrd_end_early, &uci); 723 start, size);
756#else 724#else
757 ramdisk_image = boot_params.hdr.ramdisk_image; 725 start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
758 ramdisk_size = boot_params.hdr.ramdisk_size; 726 size = boot_params.hdr.ramdisk_size;
759 initrd_start_early = ramdisk_image + PAGE_OFFSET; 727
760 initrd_end_early = initrd_start_early + ramdisk_size; 728 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
761
762 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
763 initrd_start_early, initrd_end_early,
764 &uci);
765#endif 729#endif
766} 730}
767 731
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
771 struct ucode_cpu_info uci; 735 struct ucode_cpu_info uci;
772 unsigned long *mc_saved_in_initrd_p; 736 unsigned long *mc_saved_in_initrd_p;
773 unsigned long initrd_start_addr; 737 unsigned long initrd_start_addr;
738 enum ucode_state ret;
774#ifdef CONFIG_X86_32 739#ifdef CONFIG_X86_32
775 unsigned long *initrd_start_p; 740 unsigned long *initrd_start_p;
776 741
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
793 return; 758 return;
794 759
795 collect_cpu_info_early(&uci); 760 collect_cpu_info_early(&uci);
796 load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, 761 ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
797 initrd_start_addr, &uci); 762 initrd_start_addr, &uci);
763
764 if (ret != UCODE_OK)
765 return;
766
798 apply_microcode_early(&uci, true); 767 apply_microcode_early(&uci, true);
799} 768}
800 769
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
808 777
809 collect_cpu_info_early(&uci); 778 collect_cpu_info_early(&uci);
810 779
811 ret = generic_load_microcode_early(mc_saved_data.mc_saved, 780 ret = load_microcode_early(mc_saved_data.mc_saved,
812 mc_saved_data.mc_saved_count, &uci); 781 mc_saved_data.mc_saved_count, &uci);
813 if (ret != UCODE_OK) 782 if (ret != UCODE_OK)
814 return; 783 return;
815 784
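
The reworked _save_mc() above implements a replace-or-append policy: if a saved patch already matches the incoming one's signature, it is replaced only when the incoming revision is newer; a signature never seen before is appended, and the (possibly updated) count is returned. A user-space sketch of that policy over flat structs; struct patch and save_patch() are stand-ins, not the Intel microcode headers:

#include <stdio.h>

struct patch {
	unsigned int sig;
	unsigned int rev;
};

/* Returns the updated number of saved patches, as _save_mc() now does. */
static unsigned int save_patch(struct patch *saved, unsigned int num_saved,
			       struct patch incoming)
{
	unsigned int i;
	int found = 0;

	for (i = 0; i < num_saved; i++) {
		if (saved[i].sig != incoming.sig)
			continue;

		found = 1;
		if (incoming.rev <= saved[i].rev)	/* not newer: keep the saved one */
			continue;

		saved[i] = incoming;			/* newer revision: replace in place */
		break;
	}

	/* Newly seen signature: append it. */
	if (i >= num_saved && !found)
		saved[num_saved++] = incoming;

	return num_saved;
}

int main(void)
{
	struct patch saved[8];
	unsigned int n = 0;

	n = save_patch(saved, n, (struct patch){ .sig = 0x306c3, .rev = 0x10 });
	n = save_patch(saved, n, (struct patch){ .sig = 0x306c3, .rev = 0x1c });
	n = save_patch(saved, n, (struct patch){ .sig = 0x406e3, .rev = 0x20 });

	printf("%u saved, first entry now at rev 0x%x\n", n, saved[0].rev);
	return 0;
}
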
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
38 return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; 38 return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
39} 39}
40 40
41int
42update_match_revision(struct microcode_header_intel *mc_header, int rev)
43{
44 return (mc_header->rev <= rev) ? 0 : 1;
45}
46
47int microcode_sanity_check(void *mc, int print_err) 41int microcode_sanity_check(void *mc, int print_err)
48{ 42{
49 unsigned long total_size, data_size, ext_table_size; 43 unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
128EXPORT_SYMBOL_GPL(microcode_sanity_check); 122EXPORT_SYMBOL_GPL(microcode_sanity_check);
129 123
130/* 124/*
131 * return 0 - no update found 125 * Returns 1 if update has been found, 0 otherwise.
132 * return 1 - found update
133 */ 126 */
134int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) 127int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
135{ 128{
136 struct microcode_header_intel *mc_header = mc; 129 struct microcode_header_intel *mc_header = mc;
137 struct extended_sigtable *ext_header; 130 struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
159} 152}
160 153
161/* 154/*
162 * return 0 - no update found 155 * Returns 1 if update has been found, 0 otherwise.
163 * return 1 - found update
164 */ 156 */
165int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) 157int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
166{ 158{
167 struct microcode_header_intel *mc_header = mc; 159 struct microcode_header_intel *mc_hdr = mc;
168 160
169 if (!update_match_revision(mc_header, rev)) 161 if (!revision_is_newer(mc_hdr, rev))
170 return 0; 162 return 0;
171 163
172 return get_matching_sig(csig, cpf, mc, rev); 164 return get_matching_sig(csig, cpf, rev, mc);
173} 165}
174EXPORT_SYMBOL_GPL(get_matching_microcode); 166EXPORT_SYMBOL_GPL(get_matching_microcode);
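
With the reordered arguments, get_matching_microcode(csig, cpf, rev, mc) reads as "does this blob improve on revision rev for signature csig/cpf": the revision gate (revision_is_newer()) comes first and only then are the signatures compared. A rough sketch of that two-step check; the header struct and the simplified signature test are placeholders and ignore the extended signature table that the real get_matching_sig() also walks:

#include <stdbool.h>
#include <stdio.h>

struct mc_hdr {
	unsigned int sig;
	unsigned int pf;
	int rev;
};

static bool revision_newer(const struct mc_hdr *hdr, int rev)
{
	return hdr->rev > rev;
}

/* Simplified: exact signature plus overlapping platform-flag bits. */
static bool sig_matches(const struct mc_hdr *hdr, unsigned int csig, unsigned int cpf)
{
	return hdr->sig == csig && (hdr->pf & cpf);
}

static bool matching_microcode(unsigned int csig, unsigned int cpf, int rev,
			       const struct mc_hdr *hdr)
{
	if (!revision_newer(hdr, rev))
		return false;

	return sig_matches(hdr, csig, cpf);
}

int main(void)
{
	struct mc_hdr hdr = { .sig = 0x306c3, .pf = 0x2, .rev = 0x1c };

	printf("%d\n", matching_microcode(0x306c3, 0x2, 0x10, &hdr));	/* 1: newer and matching */
	printf("%d\n", matching_microcode(0x306c3, 0x2, 0x1c, &hdr));	/* 0: not newer */
	return 0;
}
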
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
6IN=$1 6IN=$1
7OUT=$2 7OUT=$2
8 8
9function dump_array() 9dump_array()
10{ 10{
11 ARRAY=$1 11 ARRAY=$1
12 SIZE=$2 12 SIZE=$2
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
404static int mtrr_seq_show(struct seq_file *seq, void *offset) 404static int mtrr_seq_show(struct seq_file *seq, void *offset)
405{ 405{
406 char factor; 406 char factor;
407 int i, max, len; 407 int i, max;
408 mtrr_type type; 408 mtrr_type type;
409 unsigned long base, size; 409 unsigned long base, size;
410 410
411 len = 0;
412 max = num_var_ranges; 411 max = num_var_ranges;
413 for (i = 0; i < max; i++) { 412 for (i = 0; i < max; i++) {
414 mtrr_if->get(i, &base, &size, &type); 413 mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
425 size >>= 20 - PAGE_SHIFT; 424 size >>= 20 - PAGE_SHIFT;
426 } 425 }
427 /* Base can be > 32bit */ 426 /* Base can be > 32bit */
428 len += seq_printf(seq, "reg%02i: base=0x%06lx000 " 427 seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
429 "(%5luMB), size=%5lu%cB, count=%d: %s\n", 428 i, base, base >> (20 - PAGE_SHIFT),
430 i, base, base >> (20 - PAGE_SHIFT), size, 429 size, factor,
431 factor, mtrr_usage_table[i], 430 mtrr_usage_table[i], mtrr_attrib_to_str(type));
432 mtrr_attrib_to_str(type));
433 } 431 }
434 return 0; 432 return 0;
435} 433}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
263 } 263 }
264} 264}
265 265
266void hw_perf_lbr_event_destroy(struct perf_event *event)
267{
268 hw_perf_event_destroy(event);
269
270 /* undo the lbr/bts event accounting */
271 x86_del_exclusive(x86_lbr_exclusive_lbr);
272}
273
266static inline int x86_pmu_initialized(void) 274static inline int x86_pmu_initialized(void)
267{ 275{
268 return x86_pmu.handle_irq != NULL; 276 return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
302 return x86_pmu_extra_regs(val, event); 310 return x86_pmu_extra_regs(val, event);
303} 311}
304 312
313/*
314 * Check if we can create event of a certain type (that no conflicting events
315 * are present).
316 */
317int x86_add_exclusive(unsigned int what)
318{
319 int ret = -EBUSY, i;
320
321 if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
322 return 0;
323
324 mutex_lock(&pmc_reserve_mutex);
325 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
326 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
327 goto out;
328
329 atomic_inc(&x86_pmu.lbr_exclusive[what]);
330 ret = 0;
331
332out:
333 mutex_unlock(&pmc_reserve_mutex);
334 return ret;
335}
336
337void x86_del_exclusive(unsigned int what)
338{
339 atomic_dec(&x86_pmu.lbr_exclusive[what]);
340}
341
305int x86_setup_perfctr(struct perf_event *event) 342int x86_setup_perfctr(struct perf_event *event)
306{ 343{
307 struct perf_event_attr *attr = &event->attr; 344 struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
346 /* BTS is currently only allowed for user-mode. */ 383 /* BTS is currently only allowed for user-mode. */
347 if (!attr->exclude_kernel) 384 if (!attr->exclude_kernel)
348 return -EOPNOTSUPP; 385 return -EOPNOTSUPP;
386
387 /* disallow bts if conflicting events are present */
388 if (x86_add_exclusive(x86_lbr_exclusive_lbr))
389 return -EBUSY;
390
391 event->destroy = hw_perf_lbr_event_destroy;
349 } 392 }
350 393
351 hwc->config |= config; 394 hwc->config |= config;
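
x86_add_exclusive()/x86_del_exclusive() above, and their use for BTS in this hunk, follow a common "mutually exclusive users" pattern: a lock-free fast path when the caller's class already has users, and a mutex-protected slow path that only admits the first user of a class when every other class is idle. A pthread-based sketch of the same pattern with generic names (not the perf code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_CLASSES 2

static atomic_int users[NR_CLASSES];
static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;

static int add_exclusive(int what)
{
	int i, ret = -1, cur = atomic_load(&users[what]);

	/* Fast path: our class is already active, just take another reference. */
	while (cur > 0) {
		if (atomic_compare_exchange_weak(&users[what], &cur, cur + 1))
			return 0;
	}

	/* Slow path: become the first user only if every other class is idle. */
	pthread_mutex_lock(&reserve_lock);
	for (i = 0; i < NR_CLASSES; i++) {
		if (i != what && atomic_load(&users[i]))
			goto out;
	}
	atomic_fetch_add(&users[what], 1);
	ret = 0;
out:
	pthread_mutex_unlock(&reserve_lock);
	return ret;
}

static void del_exclusive(int what)
{
	atomic_fetch_sub(&users[what], 1);
}

int main(void)
{
	printf("%d\n", add_exclusive(0));	/* 0: first user of class 0 */
	printf("%d\n", add_exclusive(1));	/* -1: class 0 is busy */
	del_exclusive(0);
	printf("%d\n", add_exclusive(1));	/* 0: class 0 released */
	return 0;
}
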
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
399 442
400 if (event->attr.precise_ip > precise) 443 if (event->attr.precise_ip > precise)
401 return -EOPNOTSUPP; 444 return -EOPNOTSUPP;
402 /* 445 }
403 * check that PEBS LBR correction does not conflict with 446 /*
404 * whatever the user is asking with attr->branch_sample_type 447 * check that PEBS LBR correction does not conflict with
405 */ 448 * whatever the user is asking with attr->branch_sample_type
406 if (event->attr.precise_ip > 1 && 449 */
407 x86_pmu.intel_cap.pebs_format < 2) { 450 if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
408 u64 *br_type = &event->attr.branch_sample_type; 451 u64 *br_type = &event->attr.branch_sample_type;
409 452
410 if (has_branch_stack(event)) { 453 if (has_branch_stack(event)) {
411 if (!precise_br_compat(event)) 454 if (!precise_br_compat(event))
412 return -EOPNOTSUPP; 455 return -EOPNOTSUPP;
413 456
414 /* branch_sample_type is compatible */ 457 /* branch_sample_type is compatible */
415 458
416 } else { 459 } else {
417 /* 460 /*
418 * user did not specify branch_sample_type 461 * user did not specify branch_sample_type
419 * 462 *
420 * For PEBS fixups, we capture all 463 * For PEBS fixups, we capture all
421 * the branches at the priv level of the 464 * the branches at the priv level of the
422 * event. 465 * event.
423 */ 466 */
424 *br_type = PERF_SAMPLE_BRANCH_ANY; 467 *br_type = PERF_SAMPLE_BRANCH_ANY;
425 468
426 if (!event->attr.exclude_user) 469 if (!event->attr.exclude_user)
427 *br_type |= PERF_SAMPLE_BRANCH_USER; 470 *br_type |= PERF_SAMPLE_BRANCH_USER;
428 471
429 if (!event->attr.exclude_kernel) 472 if (!event->attr.exclude_kernel)
430 *br_type |= PERF_SAMPLE_BRANCH_KERNEL; 473 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
431 }
432 } 474 }
433 } 475 }
434 476
477 if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
478 event->attach_state |= PERF_ATTACH_TASK_DATA;
479
435 /* 480 /*
436 * Generate PMC IRQs: 481 * Generate PMC IRQs:
437 * (keep 'enabled' bit clear for now) 482 * (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
449 if (event->attr.type == PERF_TYPE_RAW) 494 if (event->attr.type == PERF_TYPE_RAW)
450 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; 495 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
451 496
497 if (event->attr.sample_period && x86_pmu.limit_period) {
498 if (x86_pmu.limit_period(event, event->attr.sample_period) >
499 event->attr.sample_period)
500 return -EINVAL;
501 }
502
452 return x86_setup_perfctr(event); 503 return x86_setup_perfctr(event);
453} 504}
454 505
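
The limit_period check added here, together with its second use in x86_perf_event_set_period() further down, defines one contract: the PMU may adjust a requested sample period before it is programmed, and a fixed period that the hook would have to change is rejected up front with -EINVAL. A sketch of that contract, assuming a hypothetical hook that merely enforces a minimum period:

#include <stdio.h>

#define EINVAL 22

/* Hypothetical limit_period hook: enforce a minimum supported period. */
static unsigned long long limit_period(unsigned long long period)
{
	const unsigned long long min_period = 128;	/* illustrative only */

	return period < min_period ? min_period : period;
}

/* Event creation (x86_pmu_hw_config): refuse a period the hook would raise. */
static int check_fixed_period(unsigned long long period)
{
	if (limit_period(period) > period)
		return -EINVAL;
	return 0;
}

/* Counter programming (x86_perf_event_set_period): apply the clamp. */
static unsigned long long program_period(unsigned long long left)
{
	return limit_period(left);
}

int main(void)
{
	printf("period 4096: %d\n", check_fixed_period(4096));	/* 0 */
	printf("period   16: %d\n", check_fixed_period(16));	/* -22 */
	printf("left     16 -> %llu\n", program_period(16));	/* 128 */
	return 0;
}
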
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
728 struct event_constraint *c; 779 struct event_constraint *c;
729 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 780 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
730 struct perf_event *e; 781 struct perf_event *e;
731 int i, wmin, wmax, num = 0; 782 int i, wmin, wmax, unsched = 0;
732 struct hw_perf_event *hwc; 783 struct hw_perf_event *hwc;
733 784
734 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 785 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
735 786
787 if (x86_pmu.start_scheduling)
788 x86_pmu.start_scheduling(cpuc);
789
736 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { 790 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
737 hwc = &cpuc->event_list[i]->hw; 791 hwc = &cpuc->event_list[i]->hw;
738 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 792 c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
739 hwc->constraint = c; 793 hwc->constraint = c;
740 794
741 wmin = min(wmin, c->weight); 795 wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
768 822
769 /* slow path */ 823 /* slow path */
770 if (i != n) 824 if (i != n)
771 num = perf_assign_events(cpuc->event_list, n, wmin, 825 unsched = perf_assign_events(cpuc->event_list, n, wmin,
772 wmax, assign); 826 wmax, assign);
773 827
774 /* 828 /*
775 * Mark the event as committed, so we do not put_constraint() 829 * In case of success (unsched = 0), mark events as committed,
776 * in case new events are added and fail scheduling. 830 * so we do not put_constraint() in case new events are added
831 * and fail to be scheduled
832 *
833 * We invoke the lower level commit callback to lock the resource
834 *
835 * We do not need to do all of this in case we are called to
836 * validate an event group (assign == NULL)
777 */ 837 */
778 if (!num && assign) { 838 if (!unsched && assign) {
779 for (i = 0; i < n; i++) { 839 for (i = 0; i < n; i++) {
780 e = cpuc->event_list[i]; 840 e = cpuc->event_list[i];
781 e->hw.flags |= PERF_X86_EVENT_COMMITTED; 841 e->hw.flags |= PERF_X86_EVENT_COMMITTED;
842 if (x86_pmu.commit_scheduling)
843 x86_pmu.commit_scheduling(cpuc, e, assign[i]);
782 } 844 }
783 } 845 }
784 /* 846
785 * scheduling failed or is just a simulation, 847 if (!assign || unsched) {
786 * free resources if necessary 848
787 */
788 if (!assign || num) {
789 for (i = 0; i < n; i++) { 849 for (i = 0; i < n; i++) {
790 e = cpuc->event_list[i]; 850 e = cpuc->event_list[i];
791 /* 851 /*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
795 if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) 855 if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
796 continue; 856 continue;
797 857
858 /*
859 * release events that failed scheduling
860 */
798 if (x86_pmu.put_event_constraints) 861 if (x86_pmu.put_event_constraints)
799 x86_pmu.put_event_constraints(cpuc, e); 862 x86_pmu.put_event_constraints(cpuc, e);
800 } 863 }
801 } 864 }
802 return num ? -EINVAL : 0; 865
866 if (x86_pmu.stop_scheduling)
867 x86_pmu.stop_scheduling(cpuc);
868
869 return unsched ? -EINVAL : 0;
803} 870}
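The reworked x86_schedule_events() above brackets the whole assignment pass with optional start_scheduling()/commit_scheduling()/stop_scheduling() hooks, so a PMU implementation (the HT exclusion code added later in this patch) can treat one scheduling attempt as a transaction: constraints are collected, counters are assigned, and the result is either committed per event or rolled back through put_event_constraints(). A standalone sketch of that control flow, with stub hooks and types that are illustrative stand-ins rather than the kernel's:

#include <stdio.h>

/* illustrative stand-in for cpu_hw_events and the x86_pmu scheduling hooks */
struct sched_ctx { int dummy; };

static void start_scheduling(struct sched_ctx *c)  { (void)c; printf("start\n"); }
static void stop_scheduling(struct sched_ctx *c)   { (void)c; printf("stop\n"); }
static void commit_scheduling(struct sched_ctx *c, int ev, int cntr)
{ (void)c; printf("commit event %d -> counter %d\n", ev, cntr); }
static void put_constraint(struct sched_ctx *c, int ev)
{ (void)c; printf("release event %d\n", ev); }

/* mirrors the transaction shape of x86_schedule_events() */
static int schedule_events(struct sched_ctx *c, int n, int *assign)
{
        int i, unsched = 0;

        start_scheduling(c);

        /* ... constraint collection and counter assignment elided ... */
        if (assign)
                for (i = 0; i < n; i++)
                        assign[i] = i;          /* pretend every event fits */

        if (!unsched && assign)
                for (i = 0; i < n; i++)
                        commit_scheduling(c, i, assign[i]);

        if (!assign || unsched)
                for (i = 0; i < n; i++)
                        put_constraint(c, i);

        stop_scheduling(c);
        return unsched ? -1 : 0;
}

int main(void)
{
        struct sched_ctx c = { 0 };
        int assign[4];

        return schedule_events(&c, 4, assign);
}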
804 871
805/* 872/*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
986 if (left > x86_pmu.max_period) 1053 if (left > x86_pmu.max_period)
987 left = x86_pmu.max_period; 1054 left = x86_pmu.max_period;
988 1055
1056 if (x86_pmu.limit_period)
1057 left = x86_pmu.limit_period(event, left);
1058
989 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; 1059 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
990 1060
991 /* 1061 /*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1033 1103
1034 hwc = &event->hw; 1104 hwc = &event->hw;
1035 1105
1036 perf_pmu_disable(event->pmu);
1037 n0 = cpuc->n_events; 1106 n0 = cpuc->n_events;
1038 ret = n = collect_events(cpuc, event, false); 1107 ret = n = collect_events(cpuc, event, false);
1039 if (ret < 0) 1108 if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
1071 1140
1072 ret = 0; 1141 ret = 0;
1073out: 1142out:
1074 perf_pmu_enable(event->pmu);
1075 return ret; 1143 return ret;
1076} 1144}
1077 1145
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
1103void perf_event_print_debug(void) 1171void perf_event_print_debug(void)
1104{ 1172{
1105 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1173 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1106 u64 pebs; 1174 u64 pebs, debugctl;
1107 struct cpu_hw_events *cpuc; 1175 struct cpu_hw_events *cpuc;
1108 unsigned long flags; 1176 unsigned long flags;
1109 int cpu, idx; 1177 int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
1121 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1189 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1122 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1190 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1123 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1191 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1124 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1125 1192
1126 pr_info("\n"); 1193 pr_info("\n");
1127 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1194 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1128 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1195 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1129 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1196 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1130 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1197 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1131 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); 1198 if (x86_pmu.pebs_constraints) {
1199 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1200 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1201 }
1202 if (x86_pmu.lbr_nr) {
1203 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1204 pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
1205 }
1132 } 1206 }
1133 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1207 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1134 1208
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1321{ 1395{
1322 unsigned int cpu = (long)hcpu; 1396 unsigned int cpu = (long)hcpu;
1323 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1397 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1324 int ret = NOTIFY_OK; 1398 int i, ret = NOTIFY_OK;
1325 1399
1326 switch (action & ~CPU_TASKS_FROZEN) { 1400 switch (action & ~CPU_TASKS_FROZEN) {
1327 case CPU_UP_PREPARE: 1401 case CPU_UP_PREPARE:
1328 cpuc->kfree_on_online = NULL; 1402 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1403 cpuc->kfree_on_online[i] = NULL;
1329 if (x86_pmu.cpu_prepare) 1404 if (x86_pmu.cpu_prepare)
1330 ret = x86_pmu.cpu_prepare(cpu); 1405 ret = x86_pmu.cpu_prepare(cpu);
1331 break; 1406 break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336 break; 1411 break;
1337 1412
1338 case CPU_ONLINE: 1413 case CPU_ONLINE:
1339 kfree(cpuc->kfree_on_online); 1414 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1415 kfree(cpuc->kfree_on_online[i]);
1416 cpuc->kfree_on_online[i] = NULL;
1417 }
1340 break; 1418 break;
1341 1419
1342 case CPU_DYING: 1420 case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
1712 if (IS_ERR(fake_cpuc)) 1790 if (IS_ERR(fake_cpuc))
1713 return PTR_ERR(fake_cpuc); 1791 return PTR_ERR(fake_cpuc);
1714 1792
1715 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1793 c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
1716 1794
1717 if (!c || !c->weight) 1795 if (!c || !c->weight)
1718 ret = -EINVAL; 1796 ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
1914 NULL, 1992 NULL,
1915}; 1993};
1916 1994
1917static void x86_pmu_flush_branch_stack(void) 1995static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
1918{ 1996{
1919 if (x86_pmu.flush_branch_stack) 1997 if (x86_pmu.sched_task)
1920 x86_pmu.flush_branch_stack(); 1998 x86_pmu.sched_task(ctx, sched_in);
1921} 1999}
1922 2000
1923void perf_check_microcode(void) 2001void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
1949 .commit_txn = x86_pmu_commit_txn, 2027 .commit_txn = x86_pmu_commit_txn,
1950 2028
1951 .event_idx = x86_pmu_event_idx, 2029 .event_idx = x86_pmu_event_idx,
1952 .flush_branch_stack = x86_pmu_flush_branch_stack, 2030 .sched_task = x86_pmu_sched_task,
2031 .task_ctx_size = sizeof(struct x86_perf_task_context),
1953}; 2032};
1954 2033
1955void arch_perf_update_userpage(struct perf_event *event, 2034void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
1968 2047
1969 data = cyc2ns_read_begin(); 2048 data = cyc2ns_read_begin();
1970 2049
2050 /*
2051 * Internal timekeeping for enabled/running/stopped times
2052 * is always in the local_clock domain.
2053 */
1971 userpg->cap_user_time = 1; 2054 userpg->cap_user_time = 1;
1972 userpg->time_mult = data->cyc2ns_mul; 2055 userpg->time_mult = data->cyc2ns_mul;
1973 userpg->time_shift = data->cyc2ns_shift; 2056 userpg->time_shift = data->cyc2ns_shift;
1974 userpg->time_offset = data->cyc2ns_offset - now; 2057 userpg->time_offset = data->cyc2ns_offset - now;
1975 2058
1976 userpg->cap_user_time_zero = 1; 2059 /*
1977 userpg->time_zero = data->cyc2ns_offset; 2060 * cap_user_time_zero doesn't make sense when we're using a different
2061 * time base for the records.
2062 */
2063 if (event->clock == &local_clock) {
2064 userpg->cap_user_time_zero = 1;
2065 userpg->time_zero = data->cyc2ns_offset;
2066 }
1978 2067
1979 cyc2ns_read_end(data); 2068 cyc2ns_read_end(data);
1980} 2069}
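For reference, userspace turns raw TSC deltas into nanoseconds with the mult/shift/offset triple published above, and time_zero is only usable when cap_user_time_zero is set, which after this change requires the event clock to be local_clock. A minimal user-side sketch of the conversion; the calibration numbers are invented, and the overflow-safe split multiplication described in the uapi perf_event.h comments is omitted for brevity:

#include <stdint.h>
#include <stdio.h>

/* user-side view of the fields arch_perf_update_userpage() fills in */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint16_t shift,
                          uint64_t offset)
{
        /* simplified; may overflow for very large cycle counts */
        return offset + ((cyc * mult) >> shift);
}

int main(void)
{
        uint32_t time_mult  = 838860800;        /* ~2.56 GHz TSC, scaled by 2^31 */
        uint16_t time_shift = 31;
        uint64_t time_zero  = 0;                /* valid only with cap_user_time_zero */

        printf("%llu ns since time_zero\n",
               (unsigned long long)cyc_to_ns(1000000ULL, time_mult,
                                             time_shift, time_zero));
        return 0;
}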
@@ -2147,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
2147static unsigned long code_segment_base(struct pt_regs *regs) 2236static unsigned long code_segment_base(struct pt_regs *regs)
2148{ 2237{
2149 /* 2238 /*
2239 * For IA32 we look at the GDT/LDT segment base to convert the
2240 * effective IP to a linear address.
2241 */
2242
2243#ifdef CONFIG_X86_32
2244 /*
2150 * If we are in VM86 mode, add the segment offset to convert to a 2245 * If we are in VM86 mode, add the segment offset to convert to a
2151 * linear address. 2246 * linear address.
2152 */ 2247 */
2153 if (regs->flags & X86_VM_MASK) 2248 if (regs->flags & X86_VM_MASK)
2154 return 0x10 * regs->cs; 2249 return 0x10 * regs->cs;
2155 2250
2156 /*
2157 * For IA32 we look at the GDT/LDT segment base to convert the
2158 * effective IP to a linear address.
2159 */
2160#ifdef CONFIG_X86_32
2161 if (user_mode(regs) && regs->cs != __USER_CS) 2251 if (user_mode(regs) && regs->cs != __USER_CS)
2162 return get_segment_base(regs->cs); 2252 return get_segment_base(regs->cs);
2163#else 2253#else
2164 if (test_thread_flag(TIF_IA32)) { 2254 if (user_mode(regs) && !user_64bit_mode(regs) &&
2165 if (user_mode(regs) && regs->cs != __USER32_CS) 2255 regs->cs != __USER32_CS)
2166 return get_segment_base(regs->cs); 2256 return get_segment_base(regs->cs);
2167 }
2168#endif 2257#endif
2169 return 0; 2258 return 0;
2170} 2259}
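The caller of code_segment_base() (not shown in this hunk) adds the returned base to the effective IP to form the linear address reported in samples; in the VM86 case that base is simply cs << 4. A tiny illustration with made-up real-mode style values:

#include <stdint.h>
#include <stdio.h>

/* VM86 case from code_segment_base(): base = 0x10 * regs->cs */
static uint32_t vm86_linear_ip(uint16_t cs, uint16_t ip)
{
        return ((uint32_t)cs << 4) + ip;
}

int main(void)
{
        printf("linear ip = 0x%05x\n", vm86_linear_ip(0xb800, 0x0123));
        return 0;
}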
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,13 +65,15 @@ struct event_constraint {
65/* 65/*
66 * struct hw_perf_event.flags flags 66 * struct hw_perf_event.flags flags
67 */ 67 */
68#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ 68#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
69#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ 69#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
70#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */ 70#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
71#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ 71#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
72#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ 72#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
73#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ 73#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
74#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ 74#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
75#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
76#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
75 77
76 78
77struct amd_nb { 79struct amd_nb {
@@ -123,8 +125,37 @@ struct intel_shared_regs {
123 unsigned core_id; /* per-core: core id */ 125 unsigned core_id; /* per-core: core id */
124}; 126};
125 127
128enum intel_excl_state_type {
129 INTEL_EXCL_UNUSED = 0, /* counter is unused */
130 INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
131 INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
132};
133
134struct intel_excl_states {
135 enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
136 enum intel_excl_state_type state[X86_PMC_IDX_MAX];
137 int num_alloc_cntrs;/* #counters allocated */
138 int max_alloc_cntrs;/* max #counters allowed */
139 bool sched_started; /* true if scheduling has started */
140};
141
142struct intel_excl_cntrs {
143 raw_spinlock_t lock;
144
145 struct intel_excl_states states[2];
146
147 int refcnt; /* per-core: #HT threads */
148 unsigned core_id; /* per-core: core id */
149};
150
126#define MAX_LBR_ENTRIES 16 151#define MAX_LBR_ENTRIES 16
127 152
153enum {
154 X86_PERF_KFREE_SHARED = 0,
155 X86_PERF_KFREE_EXCL = 1,
156 X86_PERF_KFREE_MAX
157};
158
128struct cpu_hw_events { 159struct cpu_hw_events {
129 /* 160 /*
130 * Generic x86 PMC bits 161 * Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
179 * used on Intel NHM/WSM/SNB 210 * used on Intel NHM/WSM/SNB
180 */ 211 */
181 struct intel_shared_regs *shared_regs; 212 struct intel_shared_regs *shared_regs;
213 /*
214 * manage exclusive counter access between hyperthread
215 */
216 struct event_constraint *constraint_list; /* in enable order */
217 struct intel_excl_cntrs *excl_cntrs;
218 int excl_thread_id; /* 0 or 1 */
182 219
183 /* 220 /*
184 * AMD specific bits 221 * AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
187 /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ 224 /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
188 u64 perf_ctr_virt_mask; 225 u64 perf_ctr_virt_mask;
189 226
190 void *kfree_on_online; 227 void *kfree_on_online[X86_PERF_KFREE_MAX];
191}; 228};
192 229
193#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ 230#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
202#define EVENT_CONSTRAINT(c, n, m) \ 239#define EVENT_CONSTRAINT(c, n, m) \
203 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) 240 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
204 241
242#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
243 __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
244 0, PERF_X86_EVENT_EXCL)
245
205/* 246/*
206 * The overlap flag marks event constraints with overlapping counter 247 * The overlap flag marks event constraints with overlapping counter
207 * masks. This is the case if the counter mask of such an event is not 248 * masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
259#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ 300#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
260 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) 301 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
261 302
303#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
304 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
305 HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
306
262#define INTEL_PLD_CONSTRAINT(c, n) \ 307#define INTEL_PLD_CONSTRAINT(c, n) \
263 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ 308 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
264 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) 309 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
283 328
284/* Check flags and event code, and set the HSW load flag */ 329/* Check flags and event code, and set the HSW load flag */
285#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ 330#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
286 __EVENT_CONSTRAINT(code, n, \ 331 __EVENT_CONSTRAINT(code, n, \
287 ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ 332 ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
288 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) 333 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
289 334
335#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
336 __EVENT_CONSTRAINT(code, n, \
337 ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
338 HWEIGHT(n), 0, \
339 PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
340
290/* Check flags and event code/umask, and set the HSW store flag */ 341/* Check flags and event code/umask, and set the HSW store flag */
291#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ 342#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
292 __EVENT_CONSTRAINT(code, n, \ 343 __EVENT_CONSTRAINT(code, n, \
293 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ 344 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
294 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) 345 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
295 346
347#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
348 __EVENT_CONSTRAINT(code, n, \
349 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
350 HWEIGHT(n), 0, \
351 PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
352
296/* Check flags and event code/umask, and set the HSW load flag */ 353/* Check flags and event code/umask, and set the HSW load flag */
297#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ 354#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
298 __EVENT_CONSTRAINT(code, n, \ 355 __EVENT_CONSTRAINT(code, n, \
299 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ 356 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
300 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) 357 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
301 358
359#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
360 __EVENT_CONSTRAINT(code, n, \
361 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
362 HWEIGHT(n), 0, \
363 PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
364
302/* Check flags and event code/umask, and set the HSW N/A flag */ 365/* Check flags and event code/umask, and set the HSW N/A flag */
303#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ 366#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
304 __EVENT_CONSTRAINT(code, n, \ 367 __EVENT_CONSTRAINT(code, n, \
@@ -408,6 +471,13 @@ union x86_pmu_config {
408 471
409#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value 472#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
410 473
474enum {
475 x86_lbr_exclusive_lbr,
476 x86_lbr_exclusive_bts,
477 x86_lbr_exclusive_pt,
478 x86_lbr_exclusive_max,
479};
480
411/* 481/*
412 * struct x86_pmu - generic x86 pmu 482 * struct x86_pmu - generic x86 pmu
413 */ 483 */
@@ -443,14 +513,25 @@ struct x86_pmu {
443 u64 max_period; 513 u64 max_period;
444 struct event_constraint * 514 struct event_constraint *
445 (*get_event_constraints)(struct cpu_hw_events *cpuc, 515 (*get_event_constraints)(struct cpu_hw_events *cpuc,
516 int idx,
446 struct perf_event *event); 517 struct perf_event *event);
447 518
448 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 519 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
449 struct perf_event *event); 520 struct perf_event *event);
521
522 void (*commit_scheduling)(struct cpu_hw_events *cpuc,
523 struct perf_event *event,
524 int cntr);
525
526 void (*start_scheduling)(struct cpu_hw_events *cpuc);
527
528 void (*stop_scheduling)(struct cpu_hw_events *cpuc);
529
450 struct event_constraint *event_constraints; 530 struct event_constraint *event_constraints;
451 struct x86_pmu_quirk *quirks; 531 struct x86_pmu_quirk *quirks;
452 int perfctr_second_write; 532 int perfctr_second_write;
453 bool late_ack; 533 bool late_ack;
534 unsigned (*limit_period)(struct perf_event *event, unsigned l);
454 535
455 /* 536 /*
456 * sysfs attrs 537 * sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
472 void (*cpu_dead)(int cpu); 553 void (*cpu_dead)(int cpu);
473 554
474 void (*check_microcode)(void); 555 void (*check_microcode)(void);
475 void (*flush_branch_stack)(void); 556 void (*sched_task)(struct perf_event_context *ctx,
557 bool sched_in);
476 558
477 /* 559 /*
478 * Intel Arch Perfmon v2+ 560 * Intel Arch Perfmon v2+
@@ -504,10 +586,15 @@ struct x86_pmu {
504 bool lbr_double_abort; /* duplicated lbr aborts */ 586 bool lbr_double_abort; /* duplicated lbr aborts */
505 587
506 /* 588 /*
589 * Intel PT/LBR/BTS are exclusive
590 */
591 atomic_t lbr_exclusive[x86_lbr_exclusive_max];
592
593 /*
507 * Extra registers for events 594 * Extra registers for events
508 */ 595 */
509 struct extra_reg *extra_regs; 596 struct extra_reg *extra_regs;
510 unsigned int er_flags; 597 unsigned int flags;
511 598
512 /* 599 /*
513 * Intel host/guest support (KVM) 600 * Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
515 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); 602 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
516}; 603};
517 604
605struct x86_perf_task_context {
606 u64 lbr_from[MAX_LBR_ENTRIES];
607 u64 lbr_to[MAX_LBR_ENTRIES];
608 int lbr_callstack_users;
609 int lbr_stack_state;
610};
611
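struct x86_perf_task_context is the per-task buffer (sized via the new task_ctx_size field in struct pmu earlier in this patch) that the sched_task callback can use to save and restore the LBR call stack across context switches, so the recorded call chain is not clobbered while an unrelated task runs. A toy model of that save/restore idea, with invented names and none of the real lbr_stack_state bookkeeping:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define MAX_LBR_ENTRIES 16

/* toy stand-ins for the hardware LBR registers and the per-task buffer */
static uint64_t hw_lbr_from[MAX_LBR_ENTRIES];
static uint64_t hw_lbr_to[MAX_LBR_ENTRIES];

struct task_lbr_ctx {
        uint64_t lbr_from[MAX_LBR_ENTRIES];
        uint64_t lbr_to[MAX_LBR_ENTRIES];
        bool valid;
};

/* task schedules out: snapshot the LBR stack into its context */
static void lbr_sched_out(struct task_lbr_ctx *ctx)
{
        memcpy(ctx->lbr_from, hw_lbr_from, sizeof(hw_lbr_from));
        memcpy(ctx->lbr_to,   hw_lbr_to,   sizeof(hw_lbr_to));
        ctx->valid = true;
}

/* task schedules back in: restore the snapshot if one exists */
static void lbr_sched_in(struct task_lbr_ctx *ctx)
{
        if (!ctx->valid)
                return;
        memcpy(hw_lbr_from, ctx->lbr_from, sizeof(hw_lbr_from));
        memcpy(hw_lbr_to,   ctx->lbr_to,   sizeof(hw_lbr_to));
        ctx->valid = false;
}

int main(void)
{
        struct task_lbr_ctx ctx = { .valid = false };

        hw_lbr_from[0] = 0x400123;
        lbr_sched_out(&ctx);
        hw_lbr_from[0] = 0;     /* another task ran and clobbered the LBRs */
        lbr_sched_in(&ctx);
        return hw_lbr_from[0] == 0x400123 ? 0 : 1;
}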
518#define x86_add_quirk(func_) \ 612#define x86_add_quirk(func_) \
519do { \ 613do { \
520 static struct x86_pmu_quirk __quirk __initdata = { \ 614 static struct x86_pmu_quirk __quirk __initdata = { \
@@ -524,8 +618,13 @@ do { \
524 x86_pmu.quirks = &__quirk; \ 618 x86_pmu.quirks = &__quirk; \
525} while (0) 619} while (0)
526 620
527#define ERF_NO_HT_SHARING 1 621/*
528#define ERF_HAS_RSP_1 2 622 * x86_pmu flags
623 */
624#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
625#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
626#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
627#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
529 628
530#define EVENT_VAR(_id) event_attr_##_id 629#define EVENT_VAR(_id) event_attr_##_id
531#define EVENT_PTR(_id) &event_attr_##_id.attr.attr 630#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
546 645
547extern struct x86_pmu x86_pmu __read_mostly; 646extern struct x86_pmu x86_pmu __read_mostly;
548 647
648static inline bool x86_pmu_has_lbr_callstack(void)
649{
650 return x86_pmu.lbr_sel_map &&
651 x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
652}
653
549DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); 654DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
550 655
551int x86_perf_event_set_period(struct perf_event *event); 656int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
588 return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; 693 return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
589} 694}
590 695
696int x86_add_exclusive(unsigned int what);
697
698void x86_del_exclusive(unsigned int what);
699
700void hw_perf_lbr_event_destroy(struct perf_event *event);
701
591int x86_setup_perfctr(struct perf_event *event); 702int x86_setup_perfctr(struct perf_event *event);
592 703
593int x86_pmu_hw_config(struct perf_event *event); 704int x86_pmu_hw_config(struct perf_event *event);
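x86_add_exclusive()/x86_del_exclusive() (implemented in perf_event.c, outside this hunk) arbitrate the mutually exclusive LBR, BTS and PT users counted in x86_pmu.lbr_exclusive[]: taking one resource has to fail while a different one has active users, which is how intel_pmu_hw_config() later in this patch returns -EBUSY for conflicting LBR requests. A rough single-threaded model of that arbitration, without the kernel's atomics or locking:

#include <stdio.h>

enum { EXCL_LBR, EXCL_BTS, EXCL_PT, EXCL_MAX };

static int excl_count[EXCL_MAX];        /* the kernel uses atomic_t counters */

/* fail if a *different* exclusive resource already has users */
static int add_exclusive(int what)
{
        int i;

        for (i = 0; i < EXCL_MAX; i++)
                if (i != what && excl_count[i])
                        return -1;      /* -EBUSY in the kernel */
        excl_count[what]++;
        return 0;
}

static void del_exclusive(int what)
{
        excl_count[what]--;
}

int main(void)
{
        if (add_exclusive(EXCL_LBR))
                return 1;
        /* a PT user now conflicts with the active LBR user */
        printf("PT while LBR busy: %s\n",
               add_exclusive(EXCL_PT) ? "rejected" : "allowed");
        del_exclusive(EXCL_LBR);
        return 0;
}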
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
674 785
675#ifdef CONFIG_CPU_SUP_INTEL 786#ifdef CONFIG_CPU_SUP_INTEL
676 787
788static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
789{
790 /* user explicitly requested branch sampling */
791 if (has_branch_stack(event))
792 return true;
793
794 /* implicit branch sampling to correct PEBS skid */
795 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
796 x86_pmu.intel_cap.pebs_format < 2)
797 return true;
798
799 return false;
800}
801
802static inline bool intel_pmu_has_bts(struct perf_event *event)
803{
804 if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
805 !event->attr.freq && event->hw.sample_period == 1)
806 return true;
807
808 return false;
809}
810
677int intel_pmu_save_and_restart(struct perf_event *event); 811int intel_pmu_save_and_restart(struct perf_event *event);
678 812
679struct event_constraint * 813struct event_constraint *
680x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); 814x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
815 struct perf_event *event);
681 816
682struct intel_shared_regs *allocate_shared_regs(int cpu); 817struct intel_shared_regs *allocate_shared_regs(int cpu);
683 818
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
727 862
728void intel_ds_init(void); 863void intel_ds_init(void);
729 864
865void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
866
730void intel_pmu_lbr_reset(void); 867void intel_pmu_lbr_reset(void);
731 868
732void intel_pmu_lbr_enable(struct perf_event *event); 869void intel_pmu_lbr_enable(struct perf_event *event);
733 870
734void intel_pmu_lbr_disable(struct perf_event *event); 871void intel_pmu_lbr_disable(struct perf_event *event);
735 872
736void intel_pmu_lbr_enable_all(void); 873void intel_pmu_lbr_enable_all(bool pmi);
737 874
738void intel_pmu_lbr_disable_all(void); 875void intel_pmu_lbr_disable_all(void);
739 876
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
747 884
748void intel_pmu_lbr_init_snb(void); 885void intel_pmu_lbr_init_snb(void);
749 886
887void intel_pmu_lbr_init_hsw(void);
888
750int intel_pmu_setup_lbr_filter(struct perf_event *event); 889int intel_pmu_setup_lbr_filter(struct perf_event *event);
751 890
891void intel_pt_interrupt(void);
892
893int intel_bts_interrupt(void);
894
895void intel_bts_enable_local(void);
896
897void intel_bts_disable_local(void);
898
752int p4_pmu_init(void); 899int p4_pmu_init(void);
753 900
754int p6_pmu_init(void); 901int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
758ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, 905ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
759 char *page); 906 char *page);
760 907
908static inline int is_ht_workaround_enabled(void)
909{
910 return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
911}
761#else /* CONFIG_CPU_SUP_INTEL */ 912#else /* CONFIG_CPU_SUP_INTEL */
762 913
763static inline void reserve_ds_buffers(void) 914static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
382static void amd_pmu_cpu_starting(int cpu) 382static void amd_pmu_cpu_starting(int cpu)
383{ 383{
384 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 384 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
385 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
385 struct amd_nb *nb; 386 struct amd_nb *nb;
386 int i, nb_id; 387 int i, nb_id;
387 388
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
399 continue; 400 continue;
400 401
401 if (nb->nb_id == nb_id) { 402 if (nb->nb_id == nb_id) {
402 cpuc->kfree_on_online = cpuc->amd_nb; 403 *onln = cpuc->amd_nb;
403 cpuc->amd_nb = nb; 404 cpuc->amd_nb = nb;
404 break; 405 break;
405 } 406 }
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
429} 430}
430 431
431static struct event_constraint * 432static struct event_constraint *
432amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 433amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
434 struct perf_event *event)
433{ 435{
434 /* 436 /*
435 * if not NB event or no NB, then no constraints 437 * if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
537static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 539static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
538 540
539static struct event_constraint * 541static struct event_constraint *
540amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) 542amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
543 struct perf_event *event)
541{ 544{
542 struct hw_perf_event *hwc = &event->hw; 545 struct hw_perf_event *hwc = &event->hw;
543 unsigned int event_code = amd_get_event_code(hwc); 546 unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
796 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that 796 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
797 * is using the new offset. 797 * is using the new offset.
798 */ 798 */
799static int force_ibs_eilvt_setup(void) 799static void force_ibs_eilvt_setup(void)
800{ 800{
801 int offset; 801 int offset;
802 int ret; 802 int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
811 811
812 if (offset == APIC_EILVT_NR_MAX) { 812 if (offset == APIC_EILVT_NR_MAX) {
813 printk(KERN_DEBUG "No EILVT entry available\n"); 813 printk(KERN_DEBUG "No EILVT entry available\n");
814 return -EBUSY; 814 return;
815 } 815 }
816 816
817 ret = setup_ibs_ctl(offset); 817 ret = setup_ibs_ctl(offset);
818 if (ret) 818 if (ret)
819 goto out; 819 goto out;
820 820
821 if (!ibs_eilvt_valid()) { 821 if (!ibs_eilvt_valid())
822 ret = -EFAULT;
823 goto out; 822 goto out;
824 }
825 823
826 pr_info("IBS: LVT offset %d assigned\n", offset); 824 pr_info("IBS: LVT offset %d assigned\n", offset);
827 825
828 return 0; 826 return;
829out: 827out:
830 preempt_disable(); 828 preempt_disable();
831 put_eilvt(offset); 829 put_eilvt(offset);
832 preempt_enable(); 830 preempt_enable();
833 return ret; 831 return;
834} 832}
835 833
836static void ibs_eilvt_setup(void) 834static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..219d3fb423a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/watchdog.h>
15 16
16#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
17#include <asm/hardirq.h> 18#include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
113 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 114 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
114 INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ 115 INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
115 INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ 116 INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
117
118 INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
119 INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
120 INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
121 INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
122
116 EVENT_CONSTRAINT_END 123 EVENT_CONSTRAINT_END
117}; 124};
118 125
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
131 INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ 138 INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
132 INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ 139 INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
133 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 140 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
134 /* 141
135 * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT 142 INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
136 * siblings; disable these events because they can corrupt unrelated 143 INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
137 * counters. 144 INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
138 */ 145 INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
139 INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ 146
140 INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
141 INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
142 INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
143 EVENT_CONSTRAINT_END 147 EVENT_CONSTRAINT_END
144}; 148};
145 149
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
217 INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), 221 INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
218 /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ 222 /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
219 INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), 223 INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
224
225 INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
226 INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
227 INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
228 INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
229
230 EVENT_CONSTRAINT_END
231};
232
233struct event_constraint intel_bdw_event_constraints[] = {
234 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
235 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
236 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
237 INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
238 INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
220 EVENT_CONSTRAINT_END 239 EVENT_CONSTRAINT_END
221}; 240};
222 241
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
415 434
416}; 435};
417 436
437/*
438 * Notes on the events:
439 * - data reads do not include code reads (comparable to earlier tables)
440 * - data counts include speculative execution (except L1 write, dtlb, bpu)
441 * - remote node access includes remote memory, remote cache, remote mmio.
442 * - prefetches are not included in the counts because they are not
443 * reliably counted.
444 */
445
446#define HSW_DEMAND_DATA_RD BIT_ULL(0)
447#define HSW_DEMAND_RFO BIT_ULL(1)
448#define HSW_ANY_RESPONSE BIT_ULL(16)
449#define HSW_SUPPLIER_NONE BIT_ULL(17)
450#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
451#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
452#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
453#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
454#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
455 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
456 HSW_L3_MISS_REMOTE_HOP2P)
457#define HSW_SNOOP_NONE BIT_ULL(31)
458#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
459#define HSW_SNOOP_MISS BIT_ULL(33)
460#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
461#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
462#define HSW_SNOOP_HITM BIT_ULL(36)
463#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
464#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
465 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
466 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
467 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
468#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
469#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
470#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
471#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
472 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
473#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
474
475#define BDW_L3_MISS_LOCAL BIT(26)
476#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
477 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
478 HSW_L3_MISS_REMOTE_HOP2P)
479
480
481static __initconst const u64 hsw_hw_cache_event_ids
482 [PERF_COUNT_HW_CACHE_MAX]
483 [PERF_COUNT_HW_CACHE_OP_MAX]
484 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
485{
486 [ C(L1D ) ] = {
487 [ C(OP_READ) ] = {
488 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
489 [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
490 },
491 [ C(OP_WRITE) ] = {
492 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
493 [ C(RESULT_MISS) ] = 0x0,
494 },
495 [ C(OP_PREFETCH) ] = {
496 [ C(RESULT_ACCESS) ] = 0x0,
497 [ C(RESULT_MISS) ] = 0x0,
498 },
499 },
500 [ C(L1I ) ] = {
501 [ C(OP_READ) ] = {
502 [ C(RESULT_ACCESS) ] = 0x0,
503 [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
504 },
505 [ C(OP_WRITE) ] = {
506 [ C(RESULT_ACCESS) ] = -1,
507 [ C(RESULT_MISS) ] = -1,
508 },
509 [ C(OP_PREFETCH) ] = {
510 [ C(RESULT_ACCESS) ] = 0x0,
511 [ C(RESULT_MISS) ] = 0x0,
512 },
513 },
514 [ C(LL ) ] = {
515 [ C(OP_READ) ] = {
516 [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
517 [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
518 },
519 [ C(OP_WRITE) ] = {
520 [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
521 [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
522 },
523 [ C(OP_PREFETCH) ] = {
524 [ C(RESULT_ACCESS) ] = 0x0,
525 [ C(RESULT_MISS) ] = 0x0,
526 },
527 },
528 [ C(DTLB) ] = {
529 [ C(OP_READ) ] = {
530 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
531 [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
532 },
533 [ C(OP_WRITE) ] = {
534 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
535 [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
536 },
537 [ C(OP_PREFETCH) ] = {
538 [ C(RESULT_ACCESS) ] = 0x0,
539 [ C(RESULT_MISS) ] = 0x0,
540 },
541 },
542 [ C(ITLB) ] = {
543 [ C(OP_READ) ] = {
544 [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
545 [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
546 },
547 [ C(OP_WRITE) ] = {
548 [ C(RESULT_ACCESS) ] = -1,
549 [ C(RESULT_MISS) ] = -1,
550 },
551 [ C(OP_PREFETCH) ] = {
552 [ C(RESULT_ACCESS) ] = -1,
553 [ C(RESULT_MISS) ] = -1,
554 },
555 },
556 [ C(BPU ) ] = {
557 [ C(OP_READ) ] = {
558 [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
559 [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
560 },
561 [ C(OP_WRITE) ] = {
562 [ C(RESULT_ACCESS) ] = -1,
563 [ C(RESULT_MISS) ] = -1,
564 },
565 [ C(OP_PREFETCH) ] = {
566 [ C(RESULT_ACCESS) ] = -1,
567 [ C(RESULT_MISS) ] = -1,
568 },
569 },
570 [ C(NODE) ] = {
571 [ C(OP_READ) ] = {
572 [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
573 [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
574 },
575 [ C(OP_WRITE) ] = {
576 [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
577 [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
578 },
579 [ C(OP_PREFETCH) ] = {
580 [ C(RESULT_ACCESS) ] = 0x0,
581 [ C(RESULT_MISS) ] = 0x0,
582 },
583 },
584};
585
586static __initconst const u64 hsw_hw_cache_extra_regs
587 [PERF_COUNT_HW_CACHE_MAX]
588 [PERF_COUNT_HW_CACHE_OP_MAX]
589 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
590{
591 [ C(LL ) ] = {
592 [ C(OP_READ) ] = {
593 [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
594 HSW_LLC_ACCESS,
595 [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
596 HSW_L3_MISS|HSW_ANY_SNOOP,
597 },
598 [ C(OP_WRITE) ] = {
599 [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
600 HSW_LLC_ACCESS,
601 [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
602 HSW_L3_MISS|HSW_ANY_SNOOP,
603 },
604 [ C(OP_PREFETCH) ] = {
605 [ C(RESULT_ACCESS) ] = 0x0,
606 [ C(RESULT_MISS) ] = 0x0,
607 },
608 },
609 [ C(NODE) ] = {
610 [ C(OP_READ) ] = {
611 [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
612 HSW_L3_MISS_LOCAL_DRAM|
613 HSW_SNOOP_DRAM,
614 [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
615 HSW_L3_MISS_REMOTE|
616 HSW_SNOOP_DRAM,
617 },
618 [ C(OP_WRITE) ] = {
619 [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
620 HSW_L3_MISS_LOCAL_DRAM|
621 HSW_SNOOP_DRAM,
622 [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
623 HSW_L3_MISS_REMOTE|
624 HSW_SNOOP_DRAM,
625 },
626 [ C(OP_PREFETCH) ] = {
627 [ C(RESULT_ACCESS) ] = 0x0,
628 [ C(RESULT_MISS) ] = 0x0,
629 },
630 },
631};
632
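The extra_regs table above composes OFFCORE_RESPONSE values by OR-ing the request, L3-miss and snoop bit groups defined before the cache tables; an LL read miss, for instance, is "demand data read, any flavour of L3 miss, any snoop outcome". A standalone check of how those masks combine, with the bit positions copied from the defines above (HSW_ANY_SNOOP is written as the equivalent 7-bit run for brevity):

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)               (1ULL << (n))

/* subset of the Haswell offcore response bits defined above */
#define HSW_DEMAND_DATA_RD       BIT_ULL(0)
#define HSW_L3_MISS_LOCAL_DRAM   BIT_ULL(22)
#define HSW_L3_MISS_REMOTE_HOP0  BIT_ULL(27)
#define HSW_L3_MISS_REMOTE_HOP1  BIT_ULL(28)
#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
#define HSW_L3_MISS              (HSW_L3_MISS_LOCAL_DRAM | \
                                  HSW_L3_MISS_REMOTE_HOP0 | \
                                  HSW_L3_MISS_REMOTE_HOP1 | \
                                  HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_ANY_SNOOP            (0x7fULL << 31)   /* snoop bits 31-37 */

int main(void)
{
        /* matches the [C(LL)][C(OP_READ)][C(RESULT_MISS)] entry above */
        uint64_t llc_read_miss = HSW_DEMAND_DATA_RD | HSW_L3_MISS | HSW_ANY_SNOOP;

        printf("OFFCORE_RESPONSE mask: 0x%llx\n",
               (unsigned long long)llc_read_miss);
        return 0;
}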
418static __initconst const u64 westmere_hw_cache_event_ids 633static __initconst const u64 westmere_hw_cache_event_ids
419 [PERF_COUNT_HW_CACHE_MAX] 634 [PERF_COUNT_HW_CACHE_MAX]
420 [PERF_COUNT_HW_CACHE_OP_MAX] 635 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
1029 }, 1244 },
1030}; 1245};
1031 1246
1032static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) 1247/*
1033{ 1248 * Use from PMIs where the LBRs are already disabled.
1034 /* user explicitly requested branch sampling */ 1249 */
1035 if (has_branch_stack(event)) 1250static void __intel_pmu_disable_all(void)
1036 return true;
1037
1038 /* implicit branch sampling to correct PEBS skid */
1039 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
1040 x86_pmu.intel_cap.pebs_format < 2)
1041 return true;
1042
1043 return false;
1044}
1045
1046static void intel_pmu_disable_all(void)
1047{ 1251{
1048 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1252 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1049 1253
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
1051 1255
1052 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 1256 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1053 intel_pmu_disable_bts(); 1257 intel_pmu_disable_bts();
1258 else
1259 intel_bts_disable_local();
1054 1260
1055 intel_pmu_pebs_disable_all(); 1261 intel_pmu_pebs_disable_all();
1262}
1263
1264static void intel_pmu_disable_all(void)
1265{
1266 __intel_pmu_disable_all();
1056 intel_pmu_lbr_disable_all(); 1267 intel_pmu_lbr_disable_all();
1057} 1268}
1058 1269
1059static void intel_pmu_enable_all(int added) 1270static void __intel_pmu_enable_all(int added, bool pmi)
1060{ 1271{
1061 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1272 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1062 1273
1063 intel_pmu_pebs_enable_all(); 1274 intel_pmu_pebs_enable_all();
1064 intel_pmu_lbr_enable_all(); 1275 intel_pmu_lbr_enable_all(pmi);
1065 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 1276 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
1066 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); 1277 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
1067 1278
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
1073 return; 1284 return;
1074 1285
1075 intel_pmu_enable_bts(event->hw.config); 1286 intel_pmu_enable_bts(event->hw.config);
1076 } 1287 } else
1288 intel_bts_enable_local();
1289}
1290
1291static void intel_pmu_enable_all(int added)
1292{
1293 __intel_pmu_enable_all(added, false);
1077} 1294}
1078 1295
1079/* 1296/*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
1207 * must disable before any actual event 1424 * must disable before any actual event
1208 * because any event may be combined with LBR 1425 * because any event may be combined with LBR
1209 */ 1426 */
1210 if (intel_pmu_needs_lbr_smpl(event)) 1427 if (needs_branch_stack(event))
1211 intel_pmu_lbr_disable(event); 1428 intel_pmu_lbr_disable(event);
1212 1429
1213 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1430 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
1268 * must enabled before any actual event 1485 * must enabled before any actual event
1269 * because any event may be combined with LBR 1486 * because any event may be combined with LBR
1270 */ 1487 */
1271 if (intel_pmu_needs_lbr_smpl(event)) 1488 if (needs_branch_stack(event))
1272 intel_pmu_lbr_enable(event); 1489 intel_pmu_lbr_enable(event);
1273 1490
1274 if (event->attr.exclude_host) 1491 if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
1334 if (ds) 1551 if (ds)
1335 ds->bts_index = ds->bts_buffer_base; 1552 ds->bts_index = ds->bts_buffer_base;
1336 1553
1554 /* Ack all overflows and disable fixed counters */
1555 if (x86_pmu.version >= 2) {
1556 intel_pmu_ack_status(intel_pmu_get_status());
1557 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
1558 }
1559
1560 /* Reset LBRs and LBR freezing */
1561 if (x86_pmu.lbr_nr) {
1562 update_debugctlmsr(get_debugctlmsr() &
1563 ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
1564 }
1565
1337 local_irq_restore(flags); 1566 local_irq_restore(flags);
1338} 1567}
1339 1568
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1357 */ 1586 */
1358 if (!x86_pmu.late_ack) 1587 if (!x86_pmu.late_ack)
1359 apic_write(APIC_LVTPC, APIC_DM_NMI); 1588 apic_write(APIC_LVTPC, APIC_DM_NMI);
1360 intel_pmu_disable_all(); 1589 __intel_pmu_disable_all();
1361 handled = intel_pmu_drain_bts_buffer(); 1590 handled = intel_pmu_drain_bts_buffer();
1591 handled += intel_bts_interrupt();
1362 status = intel_pmu_get_status(); 1592 status = intel_pmu_get_status();
1363 if (!status) 1593 if (!status)
1364 goto done; 1594 goto done;
@@ -1399,6 +1629,14 @@ again:
1399 } 1629 }
1400 1630
1401 /* 1631 /*
1632 * Intel PT
1633 */
1634 if (__test_and_clear_bit(55, (unsigned long *)&status)) {
1635 handled++;
1636 intel_pt_interrupt();
1637 }
1638
1639 /*
1402 * Checkpointed counters can lead to 'spurious' PMIs because the 1640 * Checkpointed counters can lead to 'spurious' PMIs because the
1403 * rollback caused by the PMI will have cleared the overflow status 1641 * rollback caused by the PMI will have cleared the overflow status
1404 * bit. Therefore always force probe these counters. 1642 * bit. Therefore always force probe these counters.
@@ -1433,7 +1671,7 @@ again:
1433 goto again; 1671 goto again;
1434 1672
1435done: 1673done:
1436 intel_pmu_enable_all(0); 1674 __intel_pmu_enable_all(0, true);
1437 /* 1675 /*
1438 * Only unmask the NMI after the overflow counters 1676 * Only unmask the NMI after the overflow counters
1439 * have been reset. This avoids spurious NMIs on 1677 * have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
1464 1702
1465static int intel_alt_er(int idx) 1703static int intel_alt_er(int idx)
1466{ 1704{
1467 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) 1705 if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
1468 return idx; 1706 return idx;
1469 1707
1470 if (idx == EXTRA_REG_RSP_0) 1708 if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1624} 1862}
1625 1863
1626struct event_constraint * 1864struct event_constraint *
1627x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1865x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
1866 struct perf_event *event)
1628{ 1867{
1629 struct event_constraint *c; 1868 struct event_constraint *c;
1630 1869
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1641} 1880}
1642 1881
1643static struct event_constraint * 1882static struct event_constraint *
1644intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1883__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
1884 struct perf_event *event)
1645{ 1885{
1646 struct event_constraint *c; 1886 struct event_constraint *c;
1647 1887
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1657 if (c) 1897 if (c)
1658 return c; 1898 return c;
1659 1899
1660 return x86_get_event_constraints(cpuc, event); 1900 return x86_get_event_constraints(cpuc, idx, event);
1901}
1902
1903static void
1904intel_start_scheduling(struct cpu_hw_events *cpuc)
1905{
1906 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
1907 struct intel_excl_states *xl, *xlo;
1908 int tid = cpuc->excl_thread_id;
1909 int o_tid = 1 - tid; /* sibling thread */
1910
1911 /*
1912 * nothing needed if in group validation mode
1913 */
1914 if (cpuc->is_fake || !is_ht_workaround_enabled())
1915 return;
1916
1917 /*
1918 * no exclusion needed
1919 */
1920 if (!excl_cntrs)
1921 return;
1922
1923 xlo = &excl_cntrs->states[o_tid];
1924 xl = &excl_cntrs->states[tid];
1925
1926 xl->sched_started = true;
1927 xl->num_alloc_cntrs = 0;
1928 /*
1929 * lock shared state until we are done scheduling
1930 * in intel_stop_scheduling();
1931 * this makes scheduling appear as a transaction
1932 */
1933 WARN_ON_ONCE(!irqs_disabled());
1934 raw_spin_lock(&excl_cntrs->lock);
1935
1936 /*
1937 * save initial state of sibling thread
1938 */
1939 memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
1940}
1941
1942static void
1943intel_stop_scheduling(struct cpu_hw_events *cpuc)
1944{
1945 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
1946 struct intel_excl_states *xl, *xlo;
1947 int tid = cpuc->excl_thread_id;
1948 int o_tid = 1 - tid; /* sibling thread */
1949
1950 /*
1951 * nothing needed if in group validation mode
1952 */
1953 if (cpuc->is_fake || !is_ht_workaround_enabled())
1954 return;
1955 /*
1956 * no exclusion needed
1957 */
1958 if (!excl_cntrs)
1959 return;
1960
1961 xlo = &excl_cntrs->states[o_tid];
1962 xl = &excl_cntrs->states[tid];
1963
1964 /*
1965 * make new sibling thread state visible
1966 */
1967 memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
1968
1969 xl->sched_started = false;
1970 /*
1971 * release shared state lock (acquired in intel_start_scheduling())
1972 */
1973 raw_spin_unlock(&excl_cntrs->lock);
1974}
1975
1976static struct event_constraint *
1977intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
1978 int idx, struct event_constraint *c)
1979{
1980 struct event_constraint *cx;
1981 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
1982 struct intel_excl_states *xl, *xlo;
1983 int is_excl, i;
1984 int tid = cpuc->excl_thread_id;
1985 int o_tid = 1 - tid; /* alternate */
1986
1987 /*
1988 * validating a group does not require
1989 * enforcing cross-thread exclusion
1990 */
1991 if (cpuc->is_fake || !is_ht_workaround_enabled())
1992 return c;
1993
1994 /*
1995 * no exclusion needed
1996 */
1997 if (!excl_cntrs)
1998 return c;
1999 /*
2000 * event requires exclusive counter access
2001 * across HT threads
2002 */
2003 is_excl = c->flags & PERF_X86_EVENT_EXCL;
2004
2005 /*
2006 * xl = state of current HT
2007 * xlo = state of sibling HT
2008 */
2009 xl = &excl_cntrs->states[tid];
2010 xlo = &excl_cntrs->states[o_tid];
2011
2012 /*
2013 * do not allow scheduling of more than max_alloc_cntrs
2014 * which is set to half the available generic counters.
2015 * This helps avoid counter starvation of the sibling thread
2016 * by ensuring that at most half the counters can be taken in
2017 * exclusive mode. There are no designated counters for the
2018 * limit; any N/2 counters can be used. This helps with
2019 * events that have specific counter constraints
2020 */
2021 if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
2022 return &emptyconstraint;
2023
2024 cx = c;
2025
2026 /*
2027 * because we modify the constraint, we need
2028 * to make a copy. Static constraints come
2029 * from static const tables.
2030 *
2031 * only needed when constraint has not yet
2032 * been cloned (marked dynamic)
2033 */
2034 if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
2035
2036 /* sanity check */
2037 if (idx < 0)
2038 return &emptyconstraint;
2039
2040 /*
2041 * grab pre-allocated constraint entry
2042 */
2043 cx = &cpuc->constraint_list[idx];
2044
2045 /*
2046 * initialize dynamic constraint
2047 * with static constraint
2048 */
2049 memcpy(cx, c, sizeof(*cx));
2050
2051 /*
2052 * mark constraint as dynamic, so we
2053 * can free it later on
2054 */
2055 cx->flags |= PERF_X86_EVENT_DYNAMIC;
2056 }
2057
2058 /*
2059 * From here on, the constraint is dynamic.
2060 * Either it was just allocated above, or it
2061 * was allocated during an earlier invocation
2062 * of this function
2063 */
2064
2065 /*
2066 * Modify static constraint with current dynamic
2067 * state of thread
2068 *
2069 * EXCLUSIVE: sibling counter measuring exclusive event
2070 * SHARED : sibling counter measuring non-exclusive event
2071 * UNUSED : sibling counter unused
2072 */
2073 for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
2074 /*
2075 * exclusive event in sibling counter
2076 * our corresponding counter cannot be used
2077 * regardless of our event
2078 */
2079 if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
2080 __clear_bit(i, cx->idxmsk);
2081 /*
2082 * if measuring an exclusive event, sibling
2083 * measuring non-exclusive, then counter cannot
2084 * be used
2085 */
2086 if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
2087 __clear_bit(i, cx->idxmsk);
2088 }
2089
2090 /*
2091 * recompute actual bit weight for scheduling algorithm
2092 */
2093 cx->weight = hweight64(cx->idxmsk64);
2094
2095 /*
2096 * if we return an empty mask, then switch
2097 * back to static empty constraint to avoid
2098 * the cost of freeing later on
2099 */
2100 if (cx->weight == 0)
2101 cx = &emptyconstraint;
2102
2103 return cx;
2104}
2105
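To make the pruning loop above concrete: if an exclusive event starts with counter mask 0xf while the sibling thread holds counter 0 exclusively and counter 1 in shared mode, counters 0 and 1 are cleared and the dynamic constraint ends up as 0xc with weight 2. A simplified standalone rendering of just that loop (no locking, plain enums in place of the kernel types):

#include <stdint.h>
#include <stdio.h>

enum excl_state { EXCL_UNUSED, EXCL_SHARED, EXCL_EXCLUSIVE };

/*
 * Drop a counter if the sibling uses it exclusively, or if our event is
 * exclusive and the sibling uses it at all; same rules as the loop above.
 */
static uint64_t prune_mask(uint64_t idxmsk, const enum excl_state *sibling,
                           int nr_counters, int is_excl)
{
        int i;

        for (i = 0; i < nr_counters; i++) {
                if (!(idxmsk & (1ULL << i)))
                        continue;
                if (sibling[i] == EXCL_EXCLUSIVE)
                        idxmsk &= ~(1ULL << i);
                if (is_excl && sibling[i] == EXCL_SHARED)
                        idxmsk &= ~(1ULL << i);
        }
        return idxmsk;
}

int main(void)
{
        enum excl_state sibling[4] = {
                EXCL_EXCLUSIVE, EXCL_SHARED, EXCL_UNUSED, EXCL_UNUSED
        };

        /* exclusive event allowed on generic counters 0-3: 0xf -> 0xc */
        printf("pruned mask: 0x%llx\n",
               (unsigned long long)prune_mask(0xf, sibling, 4, 1));
        return 0;
}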
2106static struct event_constraint *
2107intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
2108 struct perf_event *event)
2109{
2110 struct event_constraint *c1 = event->hw.constraint;
2111 struct event_constraint *c2;
2112
2113 /*
2114 * first time only
2115 * - static constraint: no change across incremental scheduling calls
2116 * - dynamic constraint: handled by intel_get_excl_constraints()
2117 */
2118 c2 = __intel_get_event_constraints(cpuc, idx, event);
2119 if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
2120 bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
2121 c1->weight = c2->weight;
2122 c2 = c1;
2123 }
2124
2125 if (cpuc->excl_cntrs)
2126 return intel_get_excl_constraints(cpuc, event, idx, c2);
2127
2128 return c2;
2129}
2130
2131static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
2132 struct perf_event *event)
2133{
2134 struct hw_perf_event *hwc = &event->hw;
2135 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
2136 struct intel_excl_states *xlo, *xl;
2137 unsigned long flags = 0; /* keep compiler happy */
2138 int tid = cpuc->excl_thread_id;
2139 int o_tid = 1 - tid;
2140
2141 /*
2142 * nothing needed if in group validation mode
2143 */
2144 if (cpuc->is_fake)
2145 return;
2146
2147 WARN_ON_ONCE(!excl_cntrs);
2148
2149 if (!excl_cntrs)
2150 return;
2151
2152 xl = &excl_cntrs->states[tid];
2153 xlo = &excl_cntrs->states[o_tid];
2154
2155 /*
2156 * put_constraint may be called from x86_schedule_events()
2157 * which already has the lock held so here make locking
2158 * conditional
2159 */
2160 if (!xl->sched_started)
2161 raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
2162
2163 /*
2164 * if event was actually assigned, then mark the
2165 * counter state as unused now
2166 */
2167 if (hwc->idx >= 0)
2168 xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
2169
2170 if (!xl->sched_started)
2171 raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
1661} 2172}
1662 2173
1663static void 2174static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1678static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 2189static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1679 struct perf_event *event) 2190 struct perf_event *event)
1680{ 2191{
2192 struct event_constraint *c = event->hw.constraint;
2193
1681 intel_put_shared_regs_event_constraints(cpuc, event); 2194 intel_put_shared_regs_event_constraints(cpuc, event);
2195
2196 /*
2197 * if the PMU has exclusive counter restrictions, then
2198 * all events are subject to them and must call the
2199 * put_excl_constraints() routine
2200 */
2201 if (c && cpuc->excl_cntrs)
2202 intel_put_excl_constraints(cpuc, event);
2203
2204 /* cleanup dynamic constraint */
2205 if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
2206 event->hw.constraint = NULL;
2207}
2208
2209static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
2210 struct perf_event *event, int cntr)
2211{
2212 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
2213 struct event_constraint *c = event->hw.constraint;
2214 struct intel_excl_states *xlo, *xl;
2215 int tid = cpuc->excl_thread_id;
2216 int o_tid = 1 - tid;
2217 int is_excl;
2218
2219 if (cpuc->is_fake || !c)
2220 return;
2221
2222 is_excl = c->flags & PERF_X86_EVENT_EXCL;
2223
2224 if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
2225 return;
2226
2227 WARN_ON_ONCE(!excl_cntrs);
2228
2229 if (!excl_cntrs)
2230 return;
2231
2232 xl = &excl_cntrs->states[tid];
2233 xlo = &excl_cntrs->states[o_tid];
2234
2235 WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
2236
2237 if (cntr >= 0) {
2238 if (is_excl)
2239 xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
2240 else
2241 xlo->init_state[cntr] = INTEL_EXCL_SHARED;
2242 }
1682} 2243}
1683 2244
1684static void intel_pebs_aliases_core2(struct perf_event *event) 2245static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
1747 if (event->attr.precise_ip && x86_pmu.pebs_aliases) 2308 if (event->attr.precise_ip && x86_pmu.pebs_aliases)
1748 x86_pmu.pebs_aliases(event); 2309 x86_pmu.pebs_aliases(event);
1749 2310
1750 if (intel_pmu_needs_lbr_smpl(event)) { 2311 if (needs_branch_stack(event)) {
1751 ret = intel_pmu_setup_lbr_filter(event); 2312 ret = intel_pmu_setup_lbr_filter(event);
1752 if (ret) 2313 if (ret)
1753 return ret; 2314 return ret;
2315
2316 /*
2317 * BTS is set up earlier in this path, so don't account twice
2318 */
2319 if (!intel_pmu_has_bts(event)) {
2320 /* disallow lbr if conflicting events are present */
2321 if (x86_add_exclusive(x86_lbr_exclusive_lbr))
2322 return -EBUSY;
2323
2324 event->destroy = hw_perf_lbr_event_destroy;
2325 }
1754 } 2326 }
1755 2327
1756 if (event->attr.type != PERF_TYPE_RAW) 2328 if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
1891 EVENT_CONSTRAINT(0, 0x4, 0); 2463 EVENT_CONSTRAINT(0, 0x4, 0);
1892 2464
1893static struct event_constraint * 2465static struct event_constraint *
1894hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 2466hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
2467 struct perf_event *event)
1895{ 2468{
1896 struct event_constraint *c = intel_get_event_constraints(cpuc, event); 2469 struct event_constraint *c;
2470
2471 c = intel_get_event_constraints(cpuc, idx, event);
1897 2472
1898 /* Handle special quirk on in_tx_checkpointed only in counter 2 */ 2473 /* Handle special quirk on in_tx_checkpointed only in counter 2 */
1899 if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { 2474 if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1905 return c; 2480 return c;
1906} 2481}
1907 2482
2483/*
2484 * Broadwell:
2485 *
2486 * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
2487 * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
2488 * the two to enforce a minimum period of 128 (the smallest value that has bits
2489 * 0-5 cleared and >= 100).
2490 *
2491 * Because of how the code in x86_perf_event_set_period() works, the truncation
2492 * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
2493 * to make up for the 'lost' events due to carrying the 'error' in period_left.
2494 *
2495 * Therefore the effective (average) period matches the requested period,
2496 * despite coarser hardware granularity.
2497 */
2498static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
2499{
2500 if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
2501 X86_CONFIG(.event=0xc0, .umask=0x01)) {
2502 if (left < 128)
2503 left = 128;
2504 left &= ~0x3fu;
2505 }
2506 return left;
2507}
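
Editor's illustration (not part of the patch): a minimal user-space sketch of the quantization applied above, assuming an event matching INST_RETIRED.ALL; the helper name bdw_quantize() is hypothetical.

#include <stdio.h>

/* Mirrors the bdw_limit_period() arithmetic: enforce a minimum period of
 * 128 and clear the low 6 bits, as required by errata BDM11 and BDM55. */
static unsigned bdw_quantize(unsigned left)
{
	if (left < 128)
		left = 128;
	return left & ~0x3fu;
}

int main(void)
{
	unsigned req[] = { 100, 150, 300, 1000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%u -> %u\n", req[i], bdw_quantize(req[i]));
	return 0;	/* prints 128, 128, 256 and 960 */
}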
2508
1908PMU_FORMAT_ATTR(event, "config:0-7" ); 2509PMU_FORMAT_ATTR(event, "config:0-7" );
1909PMU_FORMAT_ATTR(umask, "config:8-15" ); 2510PMU_FORMAT_ATTR(umask, "config:8-15" );
1910PMU_FORMAT_ATTR(edge, "config:18" ); 2511PMU_FORMAT_ATTR(edge, "config:18" );
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
1979 return regs; 2580 return regs;
1980} 2581}
1981 2582
2583static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
2584{
2585 struct intel_excl_cntrs *c;
2586 int i;
2587
2588 c = kzalloc_node(sizeof(struct intel_excl_cntrs),
2589 GFP_KERNEL, cpu_to_node(cpu));
2590 if (c) {
2591 raw_spin_lock_init(&c->lock);
2592 for (i = 0; i < X86_PMC_IDX_MAX; i++) {
2593 c->states[0].state[i] = INTEL_EXCL_UNUSED;
2594 c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
2595
2596 c->states[1].state[i] = INTEL_EXCL_UNUSED;
2597 c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
2598 }
2599 c->core_id = -1;
2600 }
2601 return c;
2602}
2603
1982static int intel_pmu_cpu_prepare(int cpu) 2604static int intel_pmu_cpu_prepare(int cpu)
1983{ 2605{
1984 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 2606 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1985 2607
1986 if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) 2608 if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
1987 return NOTIFY_OK; 2609 cpuc->shared_regs = allocate_shared_regs(cpu);
2610 if (!cpuc->shared_regs)
2611 return NOTIFY_BAD;
2612 }
1988 2613
1989 cpuc->shared_regs = allocate_shared_regs(cpu); 2614 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
1990 if (!cpuc->shared_regs) 2615 size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
1991 return NOTIFY_BAD; 2616
2617 cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
2618 if (!cpuc->constraint_list)
2619 return NOTIFY_BAD;
2620
2621 cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
2622 if (!cpuc->excl_cntrs) {
2623 kfree(cpuc->constraint_list);
2624 kfree(cpuc->shared_regs);
2625 return NOTIFY_BAD;
2626 }
2627 cpuc->excl_thread_id = 0;
2628 }
1992 2629
1993 return NOTIFY_OK; 2630 return NOTIFY_OK;
1994} 2631}
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu)
2010 if (!cpuc->shared_regs) 2647 if (!cpuc->shared_regs)
2011 return; 2648 return;
2012 2649
2013 if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { 2650 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
2651 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
2652
2014 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2653 for_each_cpu(i, topology_thread_cpumask(cpu)) {
2015 struct intel_shared_regs *pc; 2654 struct intel_shared_regs *pc;
2016 2655
2017 pc = per_cpu(cpu_hw_events, i).shared_regs; 2656 pc = per_cpu(cpu_hw_events, i).shared_regs;
2018 if (pc && pc->core_id == core_id) { 2657 if (pc && pc->core_id == core_id) {
2019 cpuc->kfree_on_online = cpuc->shared_regs; 2658 *onln = cpuc->shared_regs;
2020 cpuc->shared_regs = pc; 2659 cpuc->shared_regs = pc;
2021 break; 2660 break;
2022 } 2661 }
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu)
2027 2666
2028 if (x86_pmu.lbr_sel_map) 2667 if (x86_pmu.lbr_sel_map)
2029 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; 2668 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
2669
2670 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
2671 int h = x86_pmu.num_counters >> 1;
2672
2673 for_each_cpu(i, topology_thread_cpumask(cpu)) {
2674 struct intel_excl_cntrs *c;
2675
2676 c = per_cpu(cpu_hw_events, i).excl_cntrs;
2677 if (c && c->core_id == core_id) {
2678 cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
2679 cpuc->excl_cntrs = c;
2680 cpuc->excl_thread_id = 1;
2681 break;
2682 }
2683 }
2684 cpuc->excl_cntrs->core_id = core_id;
2685 cpuc->excl_cntrs->refcnt++;
2686 /*
2687 * set hard limit to half the number of generic counters
2688 */
2689 cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
2690 cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
2691 }
2692}
2693
2694static void free_excl_cntrs(int cpu)
2695{
2696 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
2697 struct intel_excl_cntrs *c;
2698
2699 c = cpuc->excl_cntrs;
2700 if (c) {
2701 if (c->core_id == -1 || --c->refcnt == 0)
2702 kfree(c);
2703 cpuc->excl_cntrs = NULL;
2704 kfree(cpuc->constraint_list);
2705 cpuc->constraint_list = NULL;
2706 }
2030} 2707}
2031 2708
2032static void intel_pmu_cpu_dying(int cpu) 2709static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu)
2041 cpuc->shared_regs = NULL; 2718 cpuc->shared_regs = NULL;
2042 } 2719 }
2043 2720
2044 fini_debug_store_on_cpu(cpu); 2721 free_excl_cntrs(cpu);
2045}
2046 2722
2047static void intel_pmu_flush_branch_stack(void) 2723 fini_debug_store_on_cpu(cpu);
2048{
2049 /*
2050 * Intel LBR does not tag entries with the
2051 * PID of the current task, then we need to
2052 * flush it on ctxsw
2053 * For now, we simply reset it
2054 */
2055 if (x86_pmu.lbr_nr)
2056 intel_pmu_lbr_reset();
2057} 2724}
2058 2725
2059PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); 2726PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
2107 .cpu_starting = intel_pmu_cpu_starting, 2774 .cpu_starting = intel_pmu_cpu_starting,
2108 .cpu_dying = intel_pmu_cpu_dying, 2775 .cpu_dying = intel_pmu_cpu_dying,
2109 .guest_get_msrs = intel_guest_get_msrs, 2776 .guest_get_msrs = intel_guest_get_msrs,
2110 .flush_branch_stack = intel_pmu_flush_branch_stack, 2777 .sched_task = intel_pmu_lbr_sched_task,
2111}; 2778};
2112 2779
2113static __init void intel_clovertown_quirk(void) 2780static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void)
2264 } 2931 }
2265} 2932}
2266 2933
2934/*
2935 * enable software workaround for errata:
2936 * SNB: BJ122
2937 * IVB: BV98
2938 * HSW: HSD29
2939 *
2940	 * Only needed when HT is enabled. However, detecting
2941	 * whether HT is enabled is difficult (model specific). So instead,
2942	 * we enable the workaround at early boot, and verify whether
2943	 * it is needed in a later initcall phase, once we have valid
2944	 * topology information to check if HT is actually enabled.
2945 */
2946static __init void intel_ht_bug(void)
2947{
2948 x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
2949
2950 x86_pmu.commit_scheduling = intel_commit_scheduling;
2951 x86_pmu.start_scheduling = intel_start_scheduling;
2952 x86_pmu.stop_scheduling = intel_stop_scheduling;
2953}
2954
2267EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); 2955EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
2268EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") 2956EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
2269 2957
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void)
2443 x86_pmu.event_constraints = intel_slm_event_constraints; 3131 x86_pmu.event_constraints = intel_slm_event_constraints;
2444 x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; 3132 x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
2445 x86_pmu.extra_regs = intel_slm_extra_regs; 3133 x86_pmu.extra_regs = intel_slm_extra_regs;
2446 x86_pmu.er_flags |= ERF_HAS_RSP_1; 3134 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
2447 pr_cont("Silvermont events, "); 3135 pr_cont("Silvermont events, ");
2448 break; 3136 break;
2449 3137
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void)
2461 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 3149 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
2462 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 3150 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
2463 x86_pmu.extra_regs = intel_westmere_extra_regs; 3151 x86_pmu.extra_regs = intel_westmere_extra_regs;
2464 x86_pmu.er_flags |= ERF_HAS_RSP_1; 3152 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
2465 3153
2466 x86_pmu.cpu_events = nhm_events_attrs; 3154 x86_pmu.cpu_events = nhm_events_attrs;
2467 3155
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void)
2478 case 42: /* 32nm SandyBridge */ 3166 case 42: /* 32nm SandyBridge */
2479 case 45: /* 32nm SandyBridge-E/EN/EP */ 3167 case 45: /* 32nm SandyBridge-E/EN/EP */
2480 x86_add_quirk(intel_sandybridge_quirk); 3168 x86_add_quirk(intel_sandybridge_quirk);
3169 x86_add_quirk(intel_ht_bug);
2481 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 3170 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2482 sizeof(hw_cache_event_ids)); 3171 sizeof(hw_cache_event_ids));
2483 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, 3172 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void)
2492 x86_pmu.extra_regs = intel_snbep_extra_regs; 3181 x86_pmu.extra_regs = intel_snbep_extra_regs;
2493 else 3182 else
2494 x86_pmu.extra_regs = intel_snb_extra_regs; 3183 x86_pmu.extra_regs = intel_snb_extra_regs;
3184
3185
2495 /* all extra regs are per-cpu when HT is on */ 3186 /* all extra regs are per-cpu when HT is on */
2496 x86_pmu.er_flags |= ERF_HAS_RSP_1; 3187 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
2497 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 3188 x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
2498 3189
2499 x86_pmu.cpu_events = snb_events_attrs; 3190 x86_pmu.cpu_events = snb_events_attrs;
2500 3191
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void)
2510 3201
2511 case 58: /* 22nm IvyBridge */ 3202 case 58: /* 22nm IvyBridge */
2512 case 62: /* 22nm IvyBridge-EP/EX */ 3203 case 62: /* 22nm IvyBridge-EP/EX */
3204 x86_add_quirk(intel_ht_bug);
2513 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 3205 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2514 sizeof(hw_cache_event_ids)); 3206 sizeof(hw_cache_event_ids));
2515 /* dTLB-load-misses on IVB is different than SNB */ 3207 /* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void)
2528 else 3220 else
2529 x86_pmu.extra_regs = intel_snb_extra_regs; 3221 x86_pmu.extra_regs = intel_snb_extra_regs;
2530 /* all extra regs are per-cpu when HT is on */ 3222 /* all extra regs are per-cpu when HT is on */
2531 x86_pmu.er_flags |= ERF_HAS_RSP_1; 3223 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
2532 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 3224 x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
2533 3225
2534 x86_pmu.cpu_events = snb_events_attrs; 3226 x86_pmu.cpu_events = snb_events_attrs;
2535 3227
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void)
2545 case 63: /* 22nm Haswell Server */ 3237 case 63: /* 22nm Haswell Server */
2546 case 69: /* 22nm Haswell ULT */ 3238 case 69: /* 22nm Haswell ULT */
2547 case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ 3239 case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
3240 x86_add_quirk(intel_ht_bug);
2548 x86_pmu.late_ack = true; 3241 x86_pmu.late_ack = true;
2549 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); 3242 memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
2550 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); 3243 memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
2551 3244
2552 intel_pmu_lbr_init_snb(); 3245 intel_pmu_lbr_init_hsw();
2553 3246
2554 x86_pmu.event_constraints = intel_hsw_event_constraints; 3247 x86_pmu.event_constraints = intel_hsw_event_constraints;
2555 x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; 3248 x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
2556 x86_pmu.extra_regs = intel_snbep_extra_regs; 3249 x86_pmu.extra_regs = intel_snbep_extra_regs;
2557 x86_pmu.pebs_aliases = intel_pebs_aliases_snb; 3250 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
2558 /* all extra regs are per-cpu when HT is on */ 3251 /* all extra regs are per-cpu when HT is on */
2559 x86_pmu.er_flags |= ERF_HAS_RSP_1; 3252 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
2560 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 3253 x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
2561 3254
2562 x86_pmu.hw_config = hsw_hw_config; 3255 x86_pmu.hw_config = hsw_hw_config;
2563 x86_pmu.get_event_constraints = hsw_get_event_constraints; 3256 x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void)
2566 pr_cont("Haswell events, "); 3259 pr_cont("Haswell events, ");
2567 break; 3260 break;
2568 3261
3262 case 61: /* 14nm Broadwell Core-M */
3263 case 86: /* 14nm Broadwell Xeon D */
3264 x86_pmu.late_ack = true;
3265 memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
3266 memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
3267
3268 /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
3269 hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
3270 BDW_L3_MISS|HSW_SNOOP_DRAM;
3271 hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
3272 HSW_SNOOP_DRAM;
3273 hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
3274 BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
3275 hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
3276 BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
3277
3278 intel_pmu_lbr_init_hsw();
3279
3280 x86_pmu.event_constraints = intel_bdw_event_constraints;
3281 x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
3282 x86_pmu.extra_regs = intel_snbep_extra_regs;
3283 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
3284 /* all extra regs are per-cpu when HT is on */
3285 x86_pmu.flags |= PMU_FL_HAS_RSP_1;
3286 x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
3287
3288 x86_pmu.hw_config = hsw_hw_config;
3289 x86_pmu.get_event_constraints = hsw_get_event_constraints;
3290 x86_pmu.cpu_events = hsw_events_attrs;
3291 x86_pmu.limit_period = bdw_limit_period;
3292 pr_cont("Broadwell events, ");
3293 break;
3294
2569 default: 3295 default:
2570 switch (x86_pmu.version) { 3296 switch (x86_pmu.version) {
2571 case 1: 3297 case 1:
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void)
2651 3377
2652 return 0; 3378 return 0;
2653} 3379}
3380
3381/*
3382 * HT bug: phase 2 init
3383 * Called once we have valid topology information to check
3384 * whether or not HT is enabled
3385 * If HT is off, then we disable the workaround
3386 */
3387static __init int fixup_ht_bug(void)
3388{
3389 int cpu = smp_processor_id();
3390 int w, c;
3391 /*
3392 * problem not present on this CPU model, nothing to do
3393 */
3394 if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
3395 return 0;
3396
3397 w = cpumask_weight(topology_thread_cpumask(cpu));
3398 if (w > 1) {
3399 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
3400 return 0;
3401 }
3402
3403 watchdog_nmi_disable_all();
3404
3405 x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
3406
3407 x86_pmu.commit_scheduling = NULL;
3408 x86_pmu.start_scheduling = NULL;
3409 x86_pmu.stop_scheduling = NULL;
3410
3411 watchdog_nmi_enable_all();
3412
3413 get_online_cpus();
3414
3415 for_each_online_cpu(c) {
3416 free_excl_cntrs(c);
3417 }
3418
3419 put_online_cpus();
3420 pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
3421 return 0;
3422}
3423subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
1/*
2 * BTS PMU driver for perf
3 * Copyright (c) 2013-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#undef DEBUG
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19#include <linux/bitops.h>
20#include <linux/types.h>
21#include <linux/slab.h>
22#include <linux/debugfs.h>
23#include <linux/device.h>
24#include <linux/coredump.h>
25
26#include <asm-generic/sizes.h>
27#include <asm/perf_event.h>
28
29#include "perf_event.h"
30
31struct bts_ctx {
32 struct perf_output_handle handle;
33 struct debug_store ds_back;
34 int started;
35};
36
37static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
38
39#define BTS_RECORD_SIZE 24
40#define BTS_SAFETY_MARGIN 4080
41
42struct bts_phys {
43 struct page *page;
44 unsigned long size;
45 unsigned long offset;
46 unsigned long displacement;
47};
48
49struct bts_buffer {
50 size_t real_size; /* multiple of BTS_RECORD_SIZE */
51 unsigned int nr_pages;
52 unsigned int nr_bufs;
53 unsigned int cur_buf;
54 bool snapshot;
55 local_t data_size;
56 local_t lost;
57 local_t head;
58 unsigned long end;
59 void **data_pages;
60 struct bts_phys buf[0];
61};
62
63struct pmu bts_pmu;
64
65void intel_pmu_enable_bts(u64 config);
66void intel_pmu_disable_bts(void);
67
68static size_t buf_size(struct page *page)
69{
70 return 1 << (PAGE_SHIFT + page_private(page));
71}
72
73static void *
74bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
75{
76 struct bts_buffer *buf;
77 struct page *page;
78 int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
79 unsigned long offset;
80 size_t size = nr_pages << PAGE_SHIFT;
81 int pg, nbuf, pad;
82
83 /* count all the high order buffers */
84 for (pg = 0, nbuf = 0; pg < nr_pages;) {
85 page = virt_to_page(pages[pg]);
86 if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
87 return NULL;
88 pg += 1 << page_private(page);
89 nbuf++;
90 }
91
92 /*
93	 * to avoid interrupts in overwrite mode, only allow one physical buffer
94 */
95 if (overwrite && nbuf > 1)
96 return NULL;
97
98 buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
99 if (!buf)
100 return NULL;
101
102 buf->nr_pages = nr_pages;
103 buf->nr_bufs = nbuf;
104 buf->snapshot = overwrite;
105 buf->data_pages = pages;
106 buf->real_size = size - size % BTS_RECORD_SIZE;
107
108 for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
109 unsigned int __nr_pages;
110
111 page = virt_to_page(pages[pg]);
112 __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
113 buf->buf[nbuf].page = page;
114 buf->buf[nbuf].offset = offset;
115 buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
116 buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
117 pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
118 buf->buf[nbuf].size -= pad;
119
120 pg += __nr_pages;
121 offset += __nr_pages << PAGE_SHIFT;
122 }
123
124 return buf;
125}
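
Editor's illustration (not part of the patch): a minimal user-space sketch of the pad/displacement bookkeeping above, showing how buffer sizes stay multiples of BTS_RECORD_SIZE across physical buffers, assuming two plain 4 KiB pages.

#include <stdio.h>

#define BTS_RECORD_SIZE 24

int main(void)
{
	unsigned long page_size = 4096, pad = 0, displacement, size;
	int i;

	for (i = 0; i < 2; i++) {
		/* skip enough bytes at the start of this buffer to compensate
		 * for the pad left at the end of the previous one, keeping
		 * cumulative offsets record-aligned */
		displacement = pad ? BTS_RECORD_SIZE - pad : 0;
		size = page_size - displacement;
		pad = size % BTS_RECORD_SIZE;
		size -= pad;
		printf("buf %d: displacement=%lu size=%lu pad=%lu\n",
		       i, displacement, size, pad);
	}
	return 0;	/* buf 0: 0/4080/16, buf 1: 8/4080/8 */
}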
126
127static void bts_buffer_free_aux(void *data)
128{
129 kfree(data);
130}
131
132static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
133{
134 return buf->buf[idx].offset + buf->buf[idx].displacement;
135}
136
137static void
138bts_config_buffer(struct bts_buffer *buf)
139{
140 int cpu = raw_smp_processor_id();
141 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
142 struct bts_phys *phys = &buf->buf[buf->cur_buf];
143 unsigned long index, thresh = 0, end = phys->size;
144 struct page *page = phys->page;
145
146 index = local_read(&buf->head);
147
148 if (!buf->snapshot) {
149 if (buf->end < phys->offset + buf_size(page))
150 end = buf->end - phys->offset - phys->displacement;
151
152 index -= phys->offset + phys->displacement;
153
154 if (end - index > BTS_SAFETY_MARGIN)
155 thresh = end - BTS_SAFETY_MARGIN;
156 else if (end - index > BTS_RECORD_SIZE)
157 thresh = end - BTS_RECORD_SIZE;
158 else
159 thresh = end;
160 }
161
162 ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
163 ds->bts_index = ds->bts_buffer_base + index;
164 ds->bts_absolute_maximum = ds->bts_buffer_base + end;
165 ds->bts_interrupt_threshold = !buf->snapshot
166 ? ds->bts_buffer_base + thresh
167 : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
168}
169
170static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
171{
172 unsigned long index = head - phys->offset;
173
174 memset(page_address(phys->page) + index, 0, phys->size - index);
175}
176
177static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
178{
179 if (buf->snapshot)
180 return false;
181
182 if (local_read(&buf->data_size) >= bts->handle.size ||
183 bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
184 return true;
185
186 return false;
187}
188
189static void bts_update(struct bts_ctx *bts)
190{
191 int cpu = raw_smp_processor_id();
192 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
193 struct bts_buffer *buf = perf_get_aux(&bts->handle);
194 unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
195
196 if (!buf)
197 return;
198
199 head = index + bts_buffer_offset(buf, buf->cur_buf);
200 old = local_xchg(&buf->head, head);
201
202 if (!buf->snapshot) {
203 if (old == head)
204 return;
205
206 if (ds->bts_index >= ds->bts_absolute_maximum)
207 local_inc(&buf->lost);
208
209 /*
210 * old and head are always in the same physical buffer, so we
211 * can subtract them to get the data size.
212 */
213 local_add(head - old, &buf->data_size);
214 } else {
215 local_set(&buf->data_size, head);
216 }
217}
218
219static void __bts_event_start(struct perf_event *event)
220{
221 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
222 struct bts_buffer *buf = perf_get_aux(&bts->handle);
223 u64 config = 0;
224
225 if (!buf || bts_buffer_is_full(buf, bts))
226 return;
227
228 event->hw.state = 0;
229
230 if (!buf->snapshot)
231 config |= ARCH_PERFMON_EVENTSEL_INT;
232 if (!event->attr.exclude_kernel)
233 config |= ARCH_PERFMON_EVENTSEL_OS;
234 if (!event->attr.exclude_user)
235 config |= ARCH_PERFMON_EVENTSEL_USR;
236
237 bts_config_buffer(buf);
238
239 /*
240 * local barrier to make sure that ds configuration made it
241 * before we enable BTS
242 */
243 wmb();
244
245 intel_pmu_enable_bts(config);
246}
247
248static void bts_event_start(struct perf_event *event, int flags)
249{
250 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
251
252 __bts_event_start(event);
253
254 /* PMI handler: this counter is running and likely generating PMIs */
255 ACCESS_ONCE(bts->started) = 1;
256}
257
258static void __bts_event_stop(struct perf_event *event)
259{
260 /*
261 * No extra synchronization is mandated by the documentation to have
262 * BTS data stores globally visible.
263 */
264 intel_pmu_disable_bts();
265
266 if (event->hw.state & PERF_HES_STOPPED)
267 return;
268
269 ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
270}
271
272static void bts_event_stop(struct perf_event *event, int flags)
273{
274 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
275
276 /* PMI handler: don't restart this counter */
277 ACCESS_ONCE(bts->started) = 0;
278
279 __bts_event_stop(event);
280
281 if (flags & PERF_EF_UPDATE)
282 bts_update(bts);
283}
284
285void intel_bts_enable_local(void)
286{
287 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
288
289 if (bts->handle.event && bts->started)
290 __bts_event_start(bts->handle.event);
291}
292
293void intel_bts_disable_local(void)
294{
295 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
296
297 if (bts->handle.event)
298 __bts_event_stop(bts->handle.event);
299}
300
301static int
302bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
303{
304 unsigned long head, space, next_space, pad, gap, skip, wakeup;
305 unsigned int next_buf;
306 struct bts_phys *phys, *next_phys;
307 int ret;
308
309 if (buf->snapshot)
310 return 0;
311
312 head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
313 if (WARN_ON_ONCE(head != local_read(&buf->head)))
314 return -EINVAL;
315
316 phys = &buf->buf[buf->cur_buf];
317 space = phys->offset + phys->displacement + phys->size - head;
318 pad = space;
319 if (space > handle->size) {
320 space = handle->size;
321 space -= space % BTS_RECORD_SIZE;
322 }
323 if (space <= BTS_SAFETY_MARGIN) {
324 /* See if next phys buffer has more space */
325 next_buf = buf->cur_buf + 1;
326 if (next_buf >= buf->nr_bufs)
327 next_buf = 0;
328 next_phys = &buf->buf[next_buf];
329 gap = buf_size(phys->page) - phys->displacement - phys->size +
330 next_phys->displacement;
331 skip = pad + gap;
332 if (handle->size >= skip) {
333 next_space = next_phys->size;
334 if (next_space + skip > handle->size) {
335 next_space = handle->size - skip;
336 next_space -= next_space % BTS_RECORD_SIZE;
337 }
338 if (next_space > space || !space) {
339 if (pad)
340 bts_buffer_pad_out(phys, head);
341 ret = perf_aux_output_skip(handle, skip);
342 if (ret)
343 return ret;
344 /* Advance to next phys buffer */
345 phys = next_phys;
346 space = next_space;
347 head = phys->offset + phys->displacement;
348 /*
349 * After this, cur_buf and head won't match ds
350 * anymore, so we must not be racing with
351 * bts_update().
352 */
353 buf->cur_buf = next_buf;
354 local_set(&buf->head, head);
355 }
356 }
357 }
358
359 /* Don't go far beyond wakeup watermark */
360 wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
361 handle->head;
362 if (space > wakeup) {
363 space = wakeup;
364 space -= space % BTS_RECORD_SIZE;
365 }
366
367 buf->end = head + space;
368
369 /*
370 * If we have no space, the lost notification would have been sent when
371 * we hit absolute_maximum - see bts_update()
372 */
373 if (!space)
374 return -ENOSPC;
375
376 return 0;
377}
378
379int intel_bts_interrupt(void)
380{
381 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
382 struct perf_event *event = bts->handle.event;
383 struct bts_buffer *buf;
384 s64 old_head;
385 int err;
386
387 if (!event || !bts->started)
388 return 0;
389
390 buf = perf_get_aux(&bts->handle);
391 /*
392 * Skip snapshot counters: they don't use the interrupt, but
393 * there's no other way of telling, because the pointer will
394 * keep moving
395 */
396 if (!buf || buf->snapshot)
397 return 0;
398
399 old_head = local_read(&buf->head);
400 bts_update(bts);
401
402 /* no new data */
403 if (old_head == local_read(&buf->head))
404 return 0;
405
406 perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
407 !!local_xchg(&buf->lost, 0));
408
409 buf = perf_aux_output_begin(&bts->handle, event);
410 if (!buf)
411 return 1;
412
413 err = bts_buffer_reset(buf, &bts->handle);
414 if (err)
415 perf_aux_output_end(&bts->handle, 0, false);
416
417 return 1;
418}
419
420static void bts_event_del(struct perf_event *event, int mode)
421{
422 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
423 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
424 struct bts_buffer *buf = perf_get_aux(&bts->handle);
425
426 bts_event_stop(event, PERF_EF_UPDATE);
427
428 if (buf) {
429 if (buf->snapshot)
430 bts->handle.head =
431 local_xchg(&buf->data_size,
432 buf->nr_pages << PAGE_SHIFT);
433 perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
434 !!local_xchg(&buf->lost, 0));
435 }
436
437 cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
438 cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
439 cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
440 cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
441}
442
443static int bts_event_add(struct perf_event *event, int mode)
444{
445 struct bts_buffer *buf;
446 struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
447 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
448 struct hw_perf_event *hwc = &event->hw;
449 int ret = -EBUSY;
450
451 event->hw.state = PERF_HES_STOPPED;
452
453 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
454 return -EBUSY;
455
456 if (bts->handle.event)
457 return -EBUSY;
458
459 buf = perf_aux_output_begin(&bts->handle, event);
460 if (!buf)
461 return -EINVAL;
462
463 ret = bts_buffer_reset(buf, &bts->handle);
464 if (ret) {
465 perf_aux_output_end(&bts->handle, 0, false);
466 return ret;
467 }
468
469 bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
470 bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
471 bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
472
473 if (mode & PERF_EF_START) {
474 bts_event_start(event, 0);
475 if (hwc->state & PERF_HES_STOPPED) {
476 bts_event_del(event, 0);
477 return -EBUSY;
478 }
479 }
480
481 return 0;
482}
483
484static void bts_event_destroy(struct perf_event *event)
485{
486 x86_del_exclusive(x86_lbr_exclusive_bts);
487}
488
489static int bts_event_init(struct perf_event *event)
490{
491 if (event->attr.type != bts_pmu.type)
492 return -ENOENT;
493
494 if (x86_add_exclusive(x86_lbr_exclusive_bts))
495 return -EBUSY;
496
497 event->destroy = bts_event_destroy;
498
499 return 0;
500}
501
502static void bts_event_read(struct perf_event *event)
503{
504}
505
506static __init int bts_init(void)
507{
508 if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
509 return -ENODEV;
510
511 bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
512 bts_pmu.task_ctx_nr = perf_sw_context;
513 bts_pmu.event_init = bts_event_init;
514 bts_pmu.add = bts_event_add;
515 bts_pmu.del = bts_event_del;
516 bts_pmu.start = bts_event_start;
517 bts_pmu.stop = bts_event_stop;
518 bts_pmu.read = bts_event_read;
519 bts_pmu.setup_aux = bts_buffer_setup_aux;
520 bts_pmu.free_aux = bts_buffer_free_aux;
521
522 return perf_pmu_register(&bts_pmu, "intel_bts", -1);
523}
524
525module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
1/*
2 * Intel Cache Quality-of-Service Monitoring (CQM) support.
3 *
4 * Based very, very heavily on work by Peter Zijlstra.
5 */
6
7#include <linux/perf_event.h>
8#include <linux/slab.h>
9#include <asm/cpu_device_id.h>
10#include "perf_event.h"
11
12#define MSR_IA32_PQR_ASSOC 0x0c8f
13#define MSR_IA32_QM_CTR 0x0c8e
14#define MSR_IA32_QM_EVTSEL 0x0c8d
15
16static unsigned int cqm_max_rmid = -1;
17static unsigned int cqm_l3_scale; /* supposedly cacheline size */
18
19struct intel_cqm_state {
20 raw_spinlock_t lock;
21 int rmid;
22 int cnt;
23};
24
25static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
26
27/*
28	 * Protects cache_groups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
29 * Also protects event->hw.cqm_rmid
30 *
31 * Hold either for stability, both for modification of ->hw.cqm_rmid.
32 */
33static DEFINE_MUTEX(cache_mutex);
34static DEFINE_RAW_SPINLOCK(cache_lock);
35
36/*
37 * Groups of events that have the same target(s), one RMID per group.
38 */
39static LIST_HEAD(cache_groups);
40
41/*
42 * Mask of CPUs for reading CQM values. We only need one per-socket.
43 */
44static cpumask_t cqm_cpumask;
45
46#define RMID_VAL_ERROR (1ULL << 63)
47#define RMID_VAL_UNAVAIL (1ULL << 62)
48
49#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
50
51#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
52
53/*
54 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
55 *
56 * This rmid is always free and is guaranteed to have an associated
57 * near-zero occupancy value, i.e. no cachelines are tagged with this
58 * RMID, once __intel_cqm_rmid_rotate() returns.
59 */
60static unsigned int intel_cqm_rotation_rmid;
61
62#define INVALID_RMID (-1)
63
64/*
65 * Is @rmid valid for programming the hardware?
66 *
67 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
68 * means that we should never come across an rmid with that value.
69 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
70 * assigned" and is used as part of the rotation code.
71 */
72static inline bool __rmid_valid(unsigned int rmid)
73{
74 if (!rmid || rmid == INVALID_RMID)
75 return false;
76
77 return true;
78}
79
80static u64 __rmid_read(unsigned int rmid)
81{
82 u64 val;
83
84 /*
85 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
86 * it just says that to increase confusion.
87 */
88 wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
89 rdmsrl(MSR_IA32_QM_CTR, val);
90
91 /*
92 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
93 * the number of cachelines tagged with @rmid.
94 */
95 return val;
96}
97
98enum rmid_recycle_state {
99 RMID_YOUNG = 0,
100 RMID_AVAILABLE,
101 RMID_DIRTY,
102};
103
104struct cqm_rmid_entry {
105 unsigned int rmid;
106 enum rmid_recycle_state state;
107 struct list_head list;
108 unsigned long queue_time;
109};
110
111/*
112 * cqm_rmid_free_lru - A least recently used list of RMIDs.
113 *
114 * Oldest entry at the head, newest (most recently used) entry at the
115 * tail. This list is never traversed, it's only used to keep track of
116	 * the lru order. That is, we only pick entries off the head or insert
117 * them on the tail.
118 *
119 * All entries on the list are 'free', and their RMIDs are not currently
120 * in use. To mark an RMID as in use, remove its entry from the lru
121 * list.
122 *
123 *
124 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
125 *
126	 * This list contains RMIDs that no one is currently using but that
127 * may have a non-zero occupancy value associated with them. The
128 * rotation worker moves RMIDs from the limbo list to the free list once
129 * the occupancy value drops below __intel_cqm_threshold.
130 *
131 * Both lists are protected by cache_mutex.
132 */
133static LIST_HEAD(cqm_rmid_free_lru);
134static LIST_HEAD(cqm_rmid_limbo_lru);
135
136/*
137 * We use a simple array of pointers so that we can lookup a struct
138 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
139 * and __put_rmid() from having to worry about dealing with struct
140 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
141 *
142 * Once this array is initialized it is read-only. No locks are required
143 * to access it.
144 *
145	 * All entries for all RMIDs can be looked up in this array at all
146 * times.
147 */
148static struct cqm_rmid_entry **cqm_rmid_ptrs;
149
150static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
151{
152 struct cqm_rmid_entry *entry;
153
154 entry = cqm_rmid_ptrs[rmid];
155 WARN_ON(entry->rmid != rmid);
156
157 return entry;
158}
159
160/*
161 * Returns < 0 on fail.
162 *
163 * We expect to be called with cache_mutex held.
164 */
165static int __get_rmid(void)
166{
167 struct cqm_rmid_entry *entry;
168
169 lockdep_assert_held(&cache_mutex);
170
171 if (list_empty(&cqm_rmid_free_lru))
172 return INVALID_RMID;
173
174 entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
175 list_del(&entry->list);
176
177 return entry->rmid;
178}
179
180static void __put_rmid(unsigned int rmid)
181{
182 struct cqm_rmid_entry *entry;
183
184 lockdep_assert_held(&cache_mutex);
185
186 WARN_ON(!__rmid_valid(rmid));
187 entry = __rmid_entry(rmid);
188
189 entry->queue_time = jiffies;
190 entry->state = RMID_YOUNG;
191
192 list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
193}
194
195static int intel_cqm_setup_rmid_cache(void)
196{
197 struct cqm_rmid_entry *entry;
198 unsigned int nr_rmids;
199 int r = 0;
200
201 nr_rmids = cqm_max_rmid + 1;
202 cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
203 nr_rmids, GFP_KERNEL);
204 if (!cqm_rmid_ptrs)
205 return -ENOMEM;
206
207 for (; r <= cqm_max_rmid; r++) {
208 struct cqm_rmid_entry *entry;
209
210 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
211 if (!entry)
212 goto fail;
213
214 INIT_LIST_HEAD(&entry->list);
215 entry->rmid = r;
216 cqm_rmid_ptrs[r] = entry;
217
218 list_add_tail(&entry->list, &cqm_rmid_free_lru);
219 }
220
221 /*
222 * RMID 0 is special and is always allocated. It's used for all
223 * tasks that are not monitored.
224 */
225 entry = __rmid_entry(0);
226 list_del(&entry->list);
227
228 mutex_lock(&cache_mutex);
229 intel_cqm_rotation_rmid = __get_rmid();
230 mutex_unlock(&cache_mutex);
231
232 return 0;
233fail:
234 while (r--)
235 kfree(cqm_rmid_ptrs[r]);
236
237 kfree(cqm_rmid_ptrs);
238 return -ENOMEM;
239}
240
241/*
242 * Determine if @a and @b measure the same set of tasks.
243 *
244 * If @a and @b measure the same set of tasks then we want to share a
245 * single RMID.
246 */
247static bool __match_event(struct perf_event *a, struct perf_event *b)
248{
249 /* Per-cpu and task events don't mix */
250 if ((a->attach_state & PERF_ATTACH_TASK) !=
251 (b->attach_state & PERF_ATTACH_TASK))
252 return false;
253
254#ifdef CONFIG_CGROUP_PERF
255 if (a->cgrp != b->cgrp)
256 return false;
257#endif
258
259 /* If not task event, we're machine wide */
260 if (!(b->attach_state & PERF_ATTACH_TASK))
261 return true;
262
263 /*
264	 * Events that target the same task are placed into the same cache group.
265 */
266 if (a->hw.target == b->hw.target)
267 return true;
268
269 /*
270 * Are we an inherited event?
271 */
272 if (b->parent == a)
273 return true;
274
275 return false;
276}
277
278#ifdef CONFIG_CGROUP_PERF
279static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
280{
281 if (event->attach_state & PERF_ATTACH_TASK)
282 return perf_cgroup_from_task(event->hw.target);
283
284 return event->cgrp;
285}
286#endif
287
288/*
289 * Determine if @a's tasks intersect with @b's tasks
290 *
291 * There are combinations of events that we explicitly prohibit,
292 *
293 * PROHIBITS
294 * system-wide -> cgroup and task
295 * cgroup -> system-wide
296 * -> task in cgroup
297 * task -> system-wide
298 * -> task in cgroup
299 *
300 * Call this function before allocating an RMID.
301 */
302static bool __conflict_event(struct perf_event *a, struct perf_event *b)
303{
304#ifdef CONFIG_CGROUP_PERF
305 /*
306 * We can have any number of cgroups but only one system-wide
307 * event at a time.
308 */
309 if (a->cgrp && b->cgrp) {
310 struct perf_cgroup *ac = a->cgrp;
311 struct perf_cgroup *bc = b->cgrp;
312
313 /*
314 * This condition should have been caught in
315 * __match_event() and we should be sharing an RMID.
316 */
317 WARN_ON_ONCE(ac == bc);
318
319 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
320 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
321 return true;
322
323 return false;
324 }
325
326 if (a->cgrp || b->cgrp) {
327 struct perf_cgroup *ac, *bc;
328
329 /*
330 * cgroup and system-wide events are mutually exclusive
331 */
332 if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
333 (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
334 return true;
335
336 /*
337 * Ensure neither event is part of the other's cgroup
338 */
339 ac = event_to_cgroup(a);
340 bc = event_to_cgroup(b);
341 if (ac == bc)
342 return true;
343
344 /*
345 * Must have cgroup and non-intersecting task events.
346 */
347 if (!ac || !bc)
348 return false;
349
350 /*
351 * We have cgroup and task events, and the task belongs
352	 * to a cgroup. Check for overlap.
353 */
354 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
355 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
356 return true;
357
358 return false;
359 }
360#endif
361 /*
362 * If one of them is not a task, same story as above with cgroups.
363 */
364 if (!(a->attach_state & PERF_ATTACH_TASK) ||
365 !(b->attach_state & PERF_ATTACH_TASK))
366 return true;
367
368 /*
369 * Must be non-overlapping.
370 */
371 return false;
372}
373
374struct rmid_read {
375 unsigned int rmid;
376 atomic64_t value;
377};
378
379static void __intel_cqm_event_count(void *info);
380
381/*
382 * Exchange the RMID of a group of events.
383 */
384static unsigned int
385intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
386{
387 struct perf_event *event;
388 unsigned int old_rmid = group->hw.cqm_rmid;
389 struct list_head *head = &group->hw.cqm_group_entry;
390
391 lockdep_assert_held(&cache_mutex);
392
393 /*
394 * If our RMID is being deallocated, perform a read now.
395 */
396 if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
397 struct rmid_read rr = {
398 .value = ATOMIC64_INIT(0),
399 .rmid = old_rmid,
400 };
401
402 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
403 &rr, 1);
404 local64_set(&group->count, atomic64_read(&rr.value));
405 }
406
407 raw_spin_lock_irq(&cache_lock);
408
409 group->hw.cqm_rmid = rmid;
410 list_for_each_entry(event, head, hw.cqm_group_entry)
411 event->hw.cqm_rmid = rmid;
412
413 raw_spin_unlock_irq(&cache_lock);
414
415 return old_rmid;
416}
417
418/*
419 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
420 * cachelines are still tagged with RMIDs in limbo, we progressively
421 * increment the threshold until we find an RMID in limbo with <=
422 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
423 * problem where cachelines tagged with an RMID are not steadily being
424 * evicted.
425 *
426 * On successful rotations we decrease the threshold back towards zero.
427 *
428 * __intel_cqm_max_threshold provides an upper bound on the threshold,
429 * and is measured in bytes because it's exposed to userland.
430 */
431static unsigned int __intel_cqm_threshold;
432static unsigned int __intel_cqm_max_threshold;
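
Editor's illustration (not part of the patch): how the user-visible byte threshold maps onto the cacheline units used internally, mirroring the threshold_limit conversion in __intel_cqm_rmid_rotate(); the 64-byte value for cqm_l3_scale is a hypothetical example.

#include <stdio.h>

int main(void)
{
	unsigned int cqm_l3_scale = 64;		/* hypothetical: bytes per counted line */
	unsigned int max_threshold = 16384;	/* user-set limit, in bytes */

	/* same conversion as threshold_limit in __intel_cqm_rmid_rotate() */
	unsigned int threshold_limit = max_threshold / cqm_l3_scale;

	printf("threshold_limit = %u cachelines\n", threshold_limit);	/* 256 */
	return 0;
}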
433
434/*
435 * Test whether an RMID has a zero occupancy value on this cpu.
436 */
437static void intel_cqm_stable(void *arg)
438{
439 struct cqm_rmid_entry *entry;
440
441 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
442 if (entry->state != RMID_AVAILABLE)
443 break;
444
445 if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
446 entry->state = RMID_DIRTY;
447 }
448}
449
450/*
451 * If we have group events waiting for an RMID that don't conflict with
452 * events already running, assign @rmid.
453 */
454static bool intel_cqm_sched_in_event(unsigned int rmid)
455{
456 struct perf_event *leader, *event;
457
458 lockdep_assert_held(&cache_mutex);
459
460 leader = list_first_entry(&cache_groups, struct perf_event,
461 hw.cqm_groups_entry);
462 event = leader;
463
464 list_for_each_entry_continue(event, &cache_groups,
465 hw.cqm_groups_entry) {
466 if (__rmid_valid(event->hw.cqm_rmid))
467 continue;
468
469 if (__conflict_event(event, leader))
470 continue;
471
472 intel_cqm_xchg_rmid(event, rmid);
473 return true;
474 }
475
476 return false;
477}
478
479/*
480 * Initially use this constant for both the limbo queue time and the
481 * rotation timer interval, pmu::hrtimer_interval_ms.
482 *
483 * They don't need to be the same, but the two are related since if you
484 * rotate faster than you recycle RMIDs, you may run out of available
485 * RMIDs.
486 */
487#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
488
489static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
490
491/*
492 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
493 * @nr_available: number of freeable RMIDs on the limbo list
494 *
495 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
496 * cachelines are tagged with those RMIDs. After this we can reuse them
497 * and know that the current set of active RMIDs is stable.
498 *
499 * Return %true or %false depending on whether stabilization needs to be
500 * reattempted.
501 *
502 * If we return %true then @nr_available is updated to indicate the
503 * number of RMIDs on the limbo list that have been queued for the
504 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
505 * are above __intel_cqm_threshold.
506 */
507static bool intel_cqm_rmid_stabilize(unsigned int *available)
508{
509 struct cqm_rmid_entry *entry, *tmp;
510
511 lockdep_assert_held(&cache_mutex);
512
513 *available = 0;
514 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
515 unsigned long min_queue_time;
516 unsigned long now = jiffies;
517
518 /*
519 * We hold RMIDs placed into limbo for a minimum queue
520 * time. Before the minimum queue time has elapsed we do
521 * not recycle RMIDs.
522 *
523 * The reasoning is that until a sufficient time has
524 * passed since we stopped using an RMID, any RMID
525 * placed onto the limbo list will likely still have
526 * data tagged in the cache, which means we'll probably
527 * fail to recycle it anyway.
528 *
529 * We can save ourselves an expensive IPI by skipping
530 * any RMIDs that have not been queued for the minimum
531 * time.
532 */
533 min_queue_time = entry->queue_time +
534 msecs_to_jiffies(__rmid_queue_time_ms);
535
536 if (time_after(min_queue_time, now))
537 break;
538
539 entry->state = RMID_AVAILABLE;
540 (*available)++;
541 }
542
543 /*
544 * Fast return if none of the RMIDs on the limbo list have been
545 * sitting on the queue for the minimum queue time.
546 */
547 if (!*available)
548 return false;
549
550 /*
551 * Test whether an RMID is free for each package.
552 */
553 on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
554
555 list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
556 /*
557 * Exhausted all RMIDs that have waited min queue time.
558 */
559 if (entry->state == RMID_YOUNG)
560 break;
561
562 if (entry->state == RMID_DIRTY)
563 continue;
564
565 list_del(&entry->list); /* remove from limbo */
566
567 /*
568 * The rotation RMID gets priority if it's
569	 * currently invalid, in which case skip adding
570	 * the RMID to the free lru.
571 */
572 if (!__rmid_valid(intel_cqm_rotation_rmid)) {
573 intel_cqm_rotation_rmid = entry->rmid;
574 continue;
575 }
576
577 /*
578 * If we have groups waiting for RMIDs, hand
579 * them one now provided they don't conflict.
580 */
581 if (intel_cqm_sched_in_event(entry->rmid))
582 continue;
583
584 /*
585 * Otherwise place it onto the free list.
586 */
587 list_add_tail(&entry->list, &cqm_rmid_free_lru);
588 }
589
590
591 return __rmid_valid(intel_cqm_rotation_rmid);
592}
593
594/*
595 * Pick a victim group and move it to the tail of the group list.
596 * @next: The first group without an RMID
597 */
598static void __intel_cqm_pick_and_rotate(struct perf_event *next)
599{
600 struct perf_event *rotor;
601 unsigned int rmid;
602
603 lockdep_assert_held(&cache_mutex);
604
605 rotor = list_first_entry(&cache_groups, struct perf_event,
606 hw.cqm_groups_entry);
607
608 /*
609 * The group at the front of the list should always have a valid
610 * RMID. If it doesn't then no groups have RMIDs assigned and we
611 * don't need to rotate the list.
612 */
613 if (next == rotor)
614 return;
615
616 rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
617 __put_rmid(rmid);
618
619 list_rotate_left(&cache_groups);
620}
621
622/*
623 * Deallocate the RMIDs from any events that conflict with @event, and
624 * place them on the back of the group list.
625 */
626static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
627{
628 struct perf_event *group, *g;
629 unsigned int rmid;
630
631 lockdep_assert_held(&cache_mutex);
632
633 list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
634 if (group == event)
635 continue;
636
637 rmid = group->hw.cqm_rmid;
638
639 /*
640 * Skip events that don't have a valid RMID.
641 */
642 if (!__rmid_valid(rmid))
643 continue;
644
645 /*
646 * No conflict? No problem! Leave the event alone.
647 */
648 if (!__conflict_event(group, event))
649 continue;
650
651 intel_cqm_xchg_rmid(group, INVALID_RMID);
652 __put_rmid(rmid);
653 }
654}
655
656/*
657 * Attempt to rotate the groups and assign new RMIDs.
658 *
659 * We rotate for two reasons,
660 * 1. To handle the scheduling of conflicting events
661 * 2. To recycle RMIDs
662 *
663 * Rotating RMIDs is complicated because the hardware doesn't give us
664 * any clues.
665 *
666	 * There are problems with the hardware interface; when you change the
667 * task:RMID map cachelines retain their 'old' tags, giving a skewed
668 * picture. In order to work around this, we must always keep one free
669 * RMID - intel_cqm_rotation_rmid.
670 *
671 * Rotation works by taking away an RMID from a group (the old RMID),
672 * and assigning the free RMID to another group (the new RMID). We must
673 * then wait for the old RMID to not be used (no cachelines tagged).
674	 * This ensures that all cachelines are tagged with 'active' RMIDs. At
675 * this point we can start reading values for the new RMID and treat the
676 * old RMID as the free RMID for the next rotation.
677 *
678 * Return %true or %false depending on whether we did any rotating.
679 */
680static bool __intel_cqm_rmid_rotate(void)
681{
682 struct perf_event *group, *start = NULL;
683 unsigned int threshold_limit;
684 unsigned int nr_needed = 0;
685 unsigned int nr_available;
686 bool rotated = false;
687
688 mutex_lock(&cache_mutex);
689
690again:
691 /*
692 * Fast path through this function if there are no groups and no
693 * RMIDs that need cleaning.
694 */
695 if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
696 goto out;
697
698 list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
699 if (!__rmid_valid(group->hw.cqm_rmid)) {
700 if (!start)
701 start = group;
702 nr_needed++;
703 }
704 }
705
706 /*
707 * We have some event groups, but they all have RMIDs assigned
708 * and no RMIDs need cleaning.
709 */
710 if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
711 goto out;
712
713 if (!nr_needed)
714 goto stabilize;
715
716 /*
717 * We have more event groups without RMIDs than available RMIDs,
718 * or we have event groups that conflict with the ones currently
719 * scheduled.
720 *
721 * We force deallocate the rmid of the group at the head of
722 * cache_groups. The first event group without an RMID then gets
723 * assigned intel_cqm_rotation_rmid. This ensures we always make
724 * forward progress.
725 *
726 * Rotate the cache_groups list so the previous head is now the
727 * tail.
728 */
729 __intel_cqm_pick_and_rotate(start);
730
731 /*
732 * If the rotation is going to succeed, reduce the threshold so
733 * that we don't needlessly reuse dirty RMIDs.
734 */
735 if (__rmid_valid(intel_cqm_rotation_rmid)) {
736 intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
737 intel_cqm_rotation_rmid = __get_rmid();
738
739 intel_cqm_sched_out_conflicting_events(start);
740
741 if (__intel_cqm_threshold)
742 __intel_cqm_threshold--;
743 }
744
745 rotated = true;
746
747stabilize:
748 /*
749	 * We now need to stabilize the RMID we freed above (if any) to
750 * ensure that the next time we rotate we have an RMID with zero
751 * occupancy value.
752 *
753 * Alternatively, if we didn't need to perform any rotation,
754 * we'll have a bunch of RMIDs in limbo that need stabilizing.
755 */
756 threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
757
758 while (intel_cqm_rmid_stabilize(&nr_available) &&
759 __intel_cqm_threshold < threshold_limit) {
760 unsigned int steal_limit;
761
762 /*
763 * Don't spin if nobody is actively waiting for an RMID,
764 * the rotation worker will be kicked as soon as an
765 * event needs an RMID anyway.
766 */
767 if (!nr_needed)
768 break;
769
770 /* Allow max 25% of RMIDs to be in limbo. */
771 steal_limit = (cqm_max_rmid + 1) / 4;
772
773 /*
774 * We failed to stabilize any RMIDs so our rotation
775 * logic is now stuck. In order to make forward progress
776 * we have a few options:
777 *
778 * 1. rotate ("steal") another RMID
779 * 2. increase the threshold
780 * 3. do nothing
781 *
782 * We do both of 1. and 2. until we hit the steal limit.
783 *
784 * The steal limit prevents all RMIDs ending up on the
785 * limbo list. This can happen if every RMID has a
786 * non-zero occupancy above threshold_limit, and the
787 * occupancy values aren't dropping fast enough.
788 *
789 * Note that there is prioritisation at work here - we'd
790 * rather increase the number of RMIDs on the limbo list
791 * than increase the threshold, because increasing the
792 * threshold skews the event data (because we reuse
793 * dirty RMIDs) - threshold bumps are a last resort.
794 */
795 if (nr_available < steal_limit)
796 goto again;
797
798 __intel_cqm_threshold++;
799 }
800
801out:
802 mutex_unlock(&cache_mutex);
803 return rotated;
804}
805
806static void intel_cqm_rmid_rotate(struct work_struct *work);
807
808static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
809
810static struct pmu intel_cqm_pmu;
811
812static void intel_cqm_rmid_rotate(struct work_struct *work)
813{
814 unsigned long delay;
815
816 __intel_cqm_rmid_rotate();
817
818 delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
819 schedule_delayed_work(&intel_cqm_rmid_work, delay);
820}
821
822/*
823 * Find a group and setup RMID.
824 *
825 * If we're part of a group, we use the group's RMID.
826 */
827static void intel_cqm_setup_event(struct perf_event *event,
828 struct perf_event **group)
829{
830 struct perf_event *iter;
831 unsigned int rmid;
832 bool conflict = false;
833
834 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
835 rmid = iter->hw.cqm_rmid;
836
837 if (__match_event(iter, event)) {
838 /* All tasks in a group share an RMID */
839 event->hw.cqm_rmid = rmid;
840 *group = iter;
841 return;
842 }
843
844 /*
845 * We only care about conflicts for events that are
846 * actually scheduled in (and hence have a valid RMID).
847 */
848 if (__conflict_event(iter, event) && __rmid_valid(rmid))
849 conflict = true;
850 }
851
852 if (conflict)
853 rmid = INVALID_RMID;
854 else
855 rmid = __get_rmid();
856
857 event->hw.cqm_rmid = rmid;
858}
859
860static void intel_cqm_event_read(struct perf_event *event)
861{
862 unsigned long flags;
863 unsigned int rmid;
864 u64 val;
865
866 /*
867 * Task events are handled by intel_cqm_event_count().
868 */
869 if (event->cpu == -1)
870 return;
871
872 raw_spin_lock_irqsave(&cache_lock, flags);
873 rmid = event->hw.cqm_rmid;
874
875 if (!__rmid_valid(rmid))
876 goto out;
877
878 val = __rmid_read(rmid);
879
880 /*
881 * Ignore this reading on error states and do not update the value.
882 */
883 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
884 goto out;
885
886 local64_set(&event->count, val);
887out:
888 raw_spin_unlock_irqrestore(&cache_lock, flags);
889}
890
891static void __intel_cqm_event_count(void *info)
892{
893 struct rmid_read *rr = info;
894 u64 val;
895
896 val = __rmid_read(rr->rmid);
897
898 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
899 return;
900
901 atomic64_add(val, &rr->value);
902}
903
904static inline bool cqm_group_leader(struct perf_event *event)
905{
906 return !list_empty(&event->hw.cqm_groups_entry);
907}
908
909static u64 intel_cqm_event_count(struct perf_event *event)
910{
911 unsigned long flags;
912 struct rmid_read rr = {
913 .value = ATOMIC64_INIT(0),
914 };
915
916 /*
917 * We only need to worry about task events. System-wide events
918 * are handled like usual, i.e. entirely with
919 * intel_cqm_event_read().
920 */
921 if (event->cpu != -1)
922 return __perf_event_count(event);
923
924 /*
925 * Only the group leader gets to report values. This stops us
926 * reporting duplicate values to userspace, and gives us a clear
927 * rule for which task gets to report the values.
928 *
929 * Note that it is impossible to attribute these values to
930 * specific packages - we forfeit that ability when we create
931 * task events.
932 */
933 if (!cqm_group_leader(event))
934 return 0;
935
936 /*
937 * Notice that we don't perform the reading of an RMID
938 * atomically, because we can't hold a spin lock across the
939 * IPIs.
940 *
941 * Speculatively perform the read, since @event might be
942 * assigned a different (possibly invalid) RMID while we're
943	 * busy performing the IPI calls. It's therefore necessary to
944 * check @event's RMID afterwards, and if it has changed,
945 * discard the result of the read.
946 */
947 rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
948
949 if (!__rmid_valid(rr.rmid))
950 goto out;
951
952 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
953
954 raw_spin_lock_irqsave(&cache_lock, flags);
955 if (event->hw.cqm_rmid == rr.rmid)
956 local64_set(&event->count, atomic64_read(&rr.value));
957 raw_spin_unlock_irqrestore(&cache_lock, flags);
958out:
959 return __perf_event_count(event);
960}
961
962static void intel_cqm_event_start(struct perf_event *event, int mode)
963{
964 struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
965 unsigned int rmid = event->hw.cqm_rmid;
966 unsigned long flags;
967
968 if (!(event->hw.cqm_state & PERF_HES_STOPPED))
969 return;
970
971 event->hw.cqm_state &= ~PERF_HES_STOPPED;
972
973 raw_spin_lock_irqsave(&state->lock, flags);
974
975 if (state->cnt++)
976 WARN_ON_ONCE(state->rmid != rmid);
977 else
978 WARN_ON_ONCE(state->rmid);
979
980 state->rmid = rmid;
981 wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
982
983 raw_spin_unlock_irqrestore(&state->lock, flags);
984}
985
986static void intel_cqm_event_stop(struct perf_event *event, int mode)
987{
988 struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
989 unsigned long flags;
990
991 if (event->hw.cqm_state & PERF_HES_STOPPED)
992 return;
993
994 event->hw.cqm_state |= PERF_HES_STOPPED;
995
996 raw_spin_lock_irqsave(&state->lock, flags);
997 intel_cqm_event_read(event);
998
999 if (!--state->cnt) {
1000 state->rmid = 0;
1001 wrmsrl(MSR_IA32_PQR_ASSOC, 0);
1002 } else {
1003 WARN_ON_ONCE(!state->rmid);
1004 }
1005
1006 raw_spin_unlock_irqrestore(&state->lock, flags);
1007}
1008
1009static int intel_cqm_event_add(struct perf_event *event, int mode)
1010{
1011 unsigned long flags;
1012 unsigned int rmid;
1013
1014 raw_spin_lock_irqsave(&cache_lock, flags);
1015
1016 event->hw.cqm_state = PERF_HES_STOPPED;
1017 rmid = event->hw.cqm_rmid;
1018
1019 if (__rmid_valid(rmid) && (mode & PERF_EF_START))
1020 intel_cqm_event_start(event, mode);
1021
1022 raw_spin_unlock_irqrestore(&cache_lock, flags);
1023
1024 return 0;
1025}
1026
1027static void intel_cqm_event_del(struct perf_event *event, int mode)
1028{
1029 intel_cqm_event_stop(event, mode);
1030}
1031
1032static void intel_cqm_event_destroy(struct perf_event *event)
1033{
1034 struct perf_event *group_other = NULL;
1035
1036 mutex_lock(&cache_mutex);
1037
1038 /*
1039 * If there's another event in this group...
1040 */
1041 if (!list_empty(&event->hw.cqm_group_entry)) {
1042 group_other = list_first_entry(&event->hw.cqm_group_entry,
1043 struct perf_event,
1044 hw.cqm_group_entry);
1045 list_del(&event->hw.cqm_group_entry);
1046 }
1047
1048 /*
1049 * And we're the group leader..
1050 */
1051 if (cqm_group_leader(event)) {
1052 /*
1053 * If there was a group_other, make that leader, otherwise
1054 * destroy the group and return the RMID.
1055 */
1056 if (group_other) {
1057 list_replace(&event->hw.cqm_groups_entry,
1058 &group_other->hw.cqm_groups_entry);
1059 } else {
1060 unsigned int rmid = event->hw.cqm_rmid;
1061
1062 if (__rmid_valid(rmid))
1063 __put_rmid(rmid);
1064 list_del(&event->hw.cqm_groups_entry);
1065 }
1066 }
1067
1068 mutex_unlock(&cache_mutex);
1069}
1070
1071static int intel_cqm_event_init(struct perf_event *event)
1072{
1073 struct perf_event *group = NULL;
1074 bool rotate = false;
1075
1076 if (event->attr.type != intel_cqm_pmu.type)
1077 return -ENOENT;
1078
1079 if (event->attr.config & ~QOS_EVENT_MASK)
1080 return -EINVAL;
1081
1082 /* unsupported modes and filters */
1083 if (event->attr.exclude_user ||
1084 event->attr.exclude_kernel ||
1085 event->attr.exclude_hv ||
1086 event->attr.exclude_idle ||
1087 event->attr.exclude_host ||
1088 event->attr.exclude_guest ||
1089 event->attr.sample_period) /* no sampling */
1090 return -EINVAL;
1091
1092 INIT_LIST_HEAD(&event->hw.cqm_group_entry);
1093 INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
1094
1095 event->destroy = intel_cqm_event_destroy;
1096
1097 mutex_lock(&cache_mutex);
1098
1099 /* Will also set rmid */
1100 intel_cqm_setup_event(event, &group);
1101
1102 if (group) {
1103 list_add_tail(&event->hw.cqm_group_entry,
1104 &group->hw.cqm_group_entry);
1105 } else {
1106 list_add_tail(&event->hw.cqm_groups_entry,
1107 &cache_groups);
1108
1109 /*
1110 * All RMIDs are either in use or have recently been
1111 * used. Kick the rotation worker to clean/free some.
1112 *
1113 * We only do this for the group leader, rather than for
1114 * every event in a group to save on needless work.
1115 */
1116 if (!__rmid_valid(event->hw.cqm_rmid))
1117 rotate = true;
1118 }
1119
1120 mutex_unlock(&cache_mutex);
1121
1122 if (rotate)
1123 schedule_delayed_work(&intel_cqm_rmid_work, 0);
1124
1125 return 0;
1126}
1127
1128EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
1129EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
1130EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
1131EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
1132EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
1133
1134static struct attribute *intel_cqm_events_attr[] = {
1135 EVENT_PTR(intel_cqm_llc),
1136 EVENT_PTR(intel_cqm_llc_pkg),
1137 EVENT_PTR(intel_cqm_llc_unit),
1138 EVENT_PTR(intel_cqm_llc_scale),
1139 EVENT_PTR(intel_cqm_llc_snapshot),
1140 NULL,
1141};
1142
1143static struct attribute_group intel_cqm_events_group = {
1144 .name = "events",
1145 .attrs = intel_cqm_events_attr,
1146};
1147
1148PMU_FORMAT_ATTR(event, "config:0-7");
1149static struct attribute *intel_cqm_formats_attr[] = {
1150 &format_attr_event.attr,
1151 NULL,
1152};
1153
1154static struct attribute_group intel_cqm_format_group = {
1155 .name = "format",
1156 .attrs = intel_cqm_formats_attr,
1157};
1158
1159static ssize_t
1160max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
1161 char *page)
1162{
1163 ssize_t rv;
1164
1165 mutex_lock(&cache_mutex);
1166 rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
1167 mutex_unlock(&cache_mutex);
1168
1169 return rv;
1170}
1171
1172static ssize_t
1173max_recycle_threshold_store(struct device *dev,
1174 struct device_attribute *attr,
1175 const char *buf, size_t count)
1176{
1177 unsigned int bytes, cachelines;
1178 int ret;
1179
1180 ret = kstrtouint(buf, 0, &bytes);
1181 if (ret)
1182 return ret;
1183
1184 mutex_lock(&cache_mutex);
1185
1186 __intel_cqm_max_threshold = bytes;
1187 cachelines = bytes / cqm_l3_scale;
1188
1189 /*
1190 * The new maximum takes effect immediately.
1191 */
1192 if (__intel_cqm_threshold > cachelines)
1193 __intel_cqm_threshold = cachelines;
1194
1195 mutex_unlock(&cache_mutex);
1196
1197 return count;
1198}
1199
1200static DEVICE_ATTR_RW(max_recycle_threshold);
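A hedged sketch of tuning the attribute declared above from user space; the sysfs path assumes the usual event_source location for a registered PMU, and the example value is arbitrary:

/* Hypothetical tool writing a new recycling threshold in bytes; the
 * store handler above converts it to cachelines via cqm_l3_scale. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/intel_cqm/max_recycle_threshold";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	fprintf(f, "%u\n", 16384);	/* example value: 16 KB */
	return fclose(f) ? 1 : 0;
}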
1201
1202static struct attribute *intel_cqm_attrs[] = {
1203 &dev_attr_max_recycle_threshold.attr,
1204 NULL,
1205};
1206
1207static const struct attribute_group intel_cqm_group = {
1208 .attrs = intel_cqm_attrs,
1209};
1210
1211static const struct attribute_group *intel_cqm_attr_groups[] = {
1212 &intel_cqm_events_group,
1213 &intel_cqm_format_group,
1214 &intel_cqm_group,
1215 NULL,
1216};
1217
1218static struct pmu intel_cqm_pmu = {
1219 .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
1220 .attr_groups = intel_cqm_attr_groups,
1221 .task_ctx_nr = perf_sw_context,
1222 .event_init = intel_cqm_event_init,
1223 .add = intel_cqm_event_add,
1224 .del = intel_cqm_event_del,
1225 .start = intel_cqm_event_start,
1226 .stop = intel_cqm_event_stop,
1227 .read = intel_cqm_event_read,
1228 .count = intel_cqm_event_count,
1229};
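The struct pmu above, together with the events and format groups, is the whole user-visible interface once intel_cqm_init() registers it. A hypothetical user-space sketch that opens llc_occupancy for the current task could look like the following; the sysfs path and the event=0x01/config:0-7 encoding come from the attribute definitions above, while the sleep interval and error handling are assumptions:

/* Hypothetical consumer: read the PMU type from sysfs, open llc_occupancy
 * (event=0x01 in config bits 0-7 per the format group) for the current
 * task, and read back the raw occupancy value. Multiply by the
 * events/llc_occupancy.scale attribute to convert to bytes. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int type, fd;
	FILE *f;

	f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%d", &type) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x01;		/* llc_occupancy */

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* current task, any CPU */
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("llc_occupancy (raw): %llu\n",
		       (unsigned long long)count);
	close(fd);
	return 0;
}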
1230
1231static inline void cqm_pick_event_reader(int cpu)
1232{
1233 int phys_id = topology_physical_package_id(cpu);
1234 int i;
1235
1236 for_each_cpu(i, &cqm_cpumask) {
1237 if (phys_id == topology_physical_package_id(i))
1238 return; /* already got reader for this socket */
1239 }
1240
1241 cpumask_set_cpu(cpu, &cqm_cpumask);
1242}
1243
1244static void intel_cqm_cpu_prepare(unsigned int cpu)
1245{
1246 struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
1247 struct cpuinfo_x86 *c = &cpu_data(cpu);
1248
1249 raw_spin_lock_init(&state->lock);
1250 state->rmid = 0;
1251 state->cnt = 0;
1252
1253 WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
1254 WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
1255}
1256
1257static void intel_cqm_cpu_exit(unsigned int cpu)
1258{
1259 int phys_id = topology_physical_package_id(cpu);
1260 int i;
1261
1262 /*
1263 * Is @cpu a designated cqm reader?
1264 */
1265 if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
1266 return;
1267
1268 for_each_online_cpu(i) {
1269 if (i == cpu)
1270 continue;
1271
1272 if (phys_id == topology_physical_package_id(i)) {
1273 cpumask_set_cpu(i, &cqm_cpumask);
1274 break;
1275 }
1276 }
1277}
1278
1279static int intel_cqm_cpu_notifier(struct notifier_block *nb,
1280 unsigned long action, void *hcpu)
1281{
1282 unsigned int cpu = (unsigned long)hcpu;
1283
1284 switch (action & ~CPU_TASKS_FROZEN) {
1285 case CPU_UP_PREPARE:
1286 intel_cqm_cpu_prepare(cpu);
1287 break;
1288 case CPU_DOWN_PREPARE:
1289 intel_cqm_cpu_exit(cpu);
1290 break;
1291 case CPU_STARTING:
1292 cqm_pick_event_reader(cpu);
1293 break;
1294 }
1295
1296 return NOTIFY_OK;
1297}
1298
1299static const struct x86_cpu_id intel_cqm_match[] = {
1300 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
1301 {}
1302};
1303
1304static int __init intel_cqm_init(void)
1305{
1306 char *str, scale[20];
1307 int i, cpu, ret;
1308
1309 if (!x86_match_cpu(intel_cqm_match))
1310 return -ENODEV;
1311
1312 cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
1313
1314 /*
1315 * It's possible that not all resources support the same number
1316 * of RMIDs. Instead of making scheduling much more complicated
1317 * (where we have to match a task's RMID to a cpu that supports
1318 * that many RMIDs) just find the minimum RMIDs supported across
1319 * all cpus.
1320 *
1321 * Also, check that the scales match on all cpus.
1322 */
1323 cpu_notifier_register_begin();
1324
1325 for_each_online_cpu(cpu) {
1326 struct cpuinfo_x86 *c = &cpu_data(cpu);
1327
1328 if (c->x86_cache_max_rmid < cqm_max_rmid)
1329 cqm_max_rmid = c->x86_cache_max_rmid;
1330
1331 if (c->x86_cache_occ_scale != cqm_l3_scale) {
1332 pr_err("Multiple LLC scale values, disabling\n");
1333 ret = -EINVAL;
1334 goto out;
1335 }
1336 }
1337
1338 /*
1339 * A reasonable upper limit on the max threshold is the number
1340 * of lines tagged per RMID if all RMIDs have the same number of
1341 * lines tagged in the LLC.
1342 *
1343 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
1344 */
1345 __intel_cqm_max_threshold =
1346 boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
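	/*
	 * Worked example with the figures from the comment above
	 * (assuming x86_cache_size is reported in KB): a 35MB LLC and
	 * 56 RMIDs give 35840 * 1024 / 56 = 655360 bytes (640KB) per
	 * RMID, and 640KB / 35MB is roughly 1.8% of the LLC.
	 */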
1347
1348 snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
1349 str = kstrdup(scale, GFP_KERNEL);
1350 if (!str) {
1351 ret = -ENOMEM;
1352 goto out;
1353 }
1354
1355 event_attr_intel_cqm_llc_scale.event_str = str;
1356
1357 ret = intel_cqm_setup_rmid_cache();
1358 if (ret)
1359 goto out;
1360
1361 for_each_online_cpu(i) {
1362 intel_cqm_cpu_prepare(i);
1363 cqm_pick_event_reader(i);
1364 }
1365
1366 __perf_cpu_notifier(intel_cqm_cpu_notifier);
1367
1368 ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
1369 if (ret)
1370 pr_err("Intel CQM perf registration failed: %d\n", ret);
1371 else
1372 pr_info("Intel CQM monitoring enabled\n");
1373
1374out:
1375 cpu_notifier_register_done();
1376
1377 return ret;
1378}
1379device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
461 461
462 debugctlmsr |= DEBUGCTLMSR_TR; 462 debugctlmsr |= DEBUGCTLMSR_TR;
463 debugctlmsr |= DEBUGCTLMSR_BTS; 463 debugctlmsr |= DEBUGCTLMSR_BTS;
464 debugctlmsr |= DEBUGCTLMSR_BTINT; 464 if (config & ARCH_PERFMON_EVENTSEL_INT)
465 debugctlmsr |= DEBUGCTLMSR_BTINT;
465 466
466 if (!(config & ARCH_PERFMON_EVENTSEL_OS)) 467 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
467 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; 468 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -557,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
557 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 558 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
558 INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ 559 INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
559 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ 560 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
561 /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
562 INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
560 EVENT_CONSTRAINT_END 563 EVENT_CONSTRAINT_END
561}; 564};
562 565
@@ -564,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
564 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ 567 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
565 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ 568 INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
566 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ 569 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
570 /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
571 INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
567 EVENT_CONSTRAINT_END 572 EVENT_CONSTRAINT_END
568}; 573};
569 574
@@ -587,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
587 INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ 592 INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
588 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ 593 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
589 INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ 594 INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
595 /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
596 INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
590 EVENT_CONSTRAINT_END 597 EVENT_CONSTRAINT_END
591}; 598};
592 599
@@ -602,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
602 INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ 609 INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
603 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ 610 INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
604 INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ 611 INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
612 /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
613 INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
605 EVENT_CONSTRAINT_END 614 EVENT_CONSTRAINT_END
606}; 615};
607 616
@@ -611,6 +620,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
611 INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ 620 INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
612 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ 621 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
613 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), 622 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
623 INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
624 INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
625 INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
626 INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
614 /* Allow all events as PEBS with no flags */ 627 /* Allow all events as PEBS with no flags */
615 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), 628 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
616 EVENT_CONSTRAINT_END 629 EVENT_CONSTRAINT_END
@@ -622,6 +635,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
622 INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ 635 INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
623 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ 636 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
624 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), 637 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
638 INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
639 INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
640 INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
641 INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
625 /* Allow all events as PEBS with no flags */ 642 /* Allow all events as PEBS with no flags */
626 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), 643 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
627 EVENT_CONSTRAINT_END 644 EVENT_CONSTRAINT_END
@@ -633,16 +650,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
633 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ 650 /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
634 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), 651 INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
635 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ 652 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
636 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ 653 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
637 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ 654 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
638 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ 655 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
639 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ 656 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
640 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ 657 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
641 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ 658 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
642 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ 659 INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
643 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ 660 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
644 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ 661 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
645 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ 662 INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
646 /* Allow all events as PEBS with no flags */ 663 /* Allow all events as PEBS with no flags */
647 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), 664 INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
648 EVENT_CONSTRAINT_END 665 EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
39#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ 39#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
40#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ 40#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
41#define LBR_FAR_BIT 8 /* do not capture far branches */ 41#define LBR_FAR_BIT 8 /* do not capture far branches */
42#define LBR_CALL_STACK_BIT 9 /* enable call stack */
42 43
43#define LBR_KERNEL (1 << LBR_KERNEL_BIT) 44#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
44#define LBR_USER (1 << LBR_USER_BIT) 45#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
49#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) 50#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
50#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) 51#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
51#define LBR_FAR (1 << LBR_FAR_BIT) 52#define LBR_FAR (1 << LBR_FAR_BIT)
53#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
52 54
53#define LBR_PLM (LBR_KERNEL | LBR_USER) 55#define LBR_PLM (LBR_KERNEL | LBR_USER)
54 56
@@ -69,33 +71,31 @@ static enum {
69#define LBR_FROM_FLAG_IN_TX (1ULL << 62) 71#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
70#define LBR_FROM_FLAG_ABORT (1ULL << 61) 72#define LBR_FROM_FLAG_ABORT (1ULL << 61)
71 73
72#define for_each_branch_sample_type(x) \
73 for ((x) = PERF_SAMPLE_BRANCH_USER; \
74 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
75
76/* 74/*
77 * x86 control flow change classification 75 * x86 control flow change classification
78 * x86 control flow changes include branches, interrupts, traps, faults 76 * x86 control flow changes include branches, interrupts, traps, faults
79 */ 77 */
80enum { 78enum {
81 X86_BR_NONE = 0, /* unknown */ 79 X86_BR_NONE = 0, /* unknown */
82 80
83 X86_BR_USER = 1 << 0, /* branch target is user */ 81 X86_BR_USER = 1 << 0, /* branch target is user */
84 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ 82 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
85 83
86 X86_BR_CALL = 1 << 2, /* call */ 84 X86_BR_CALL = 1 << 2, /* call */
87 X86_BR_RET = 1 << 3, /* return */ 85 X86_BR_RET = 1 << 3, /* return */
88 X86_BR_SYSCALL = 1 << 4, /* syscall */ 86 X86_BR_SYSCALL = 1 << 4, /* syscall */
89 X86_BR_SYSRET = 1 << 5, /* syscall return */ 87 X86_BR_SYSRET = 1 << 5, /* syscall return */
90 X86_BR_INT = 1 << 6, /* sw interrupt */ 88 X86_BR_INT = 1 << 6, /* sw interrupt */
91 X86_BR_IRET = 1 << 7, /* return from interrupt */ 89 X86_BR_IRET = 1 << 7, /* return from interrupt */
92 X86_BR_JCC = 1 << 8, /* conditional */ 90 X86_BR_JCC = 1 << 8, /* conditional */
93 X86_BR_JMP = 1 << 9, /* jump */ 91 X86_BR_JMP = 1 << 9, /* jump */
94 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ 92 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
95 X86_BR_IND_CALL = 1 << 11,/* indirect calls */ 93 X86_BR_IND_CALL = 1 << 11,/* indirect calls */
96 X86_BR_ABORT = 1 << 12,/* transaction abort */ 94 X86_BR_ABORT = 1 << 12,/* transaction abort */
97 X86_BR_IN_TX = 1 << 13,/* in transaction */ 95 X86_BR_IN_TX = 1 << 13,/* in transaction */
98 X86_BR_NO_TX = 1 << 14,/* not in transaction */ 96 X86_BR_NO_TX = 1 << 14,/* not in transaction */
97 X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
98 X86_BR_CALL_STACK = 1 << 16,/* call stack */
99}; 99};
100 100
101#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) 101#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
112 X86_BR_JMP |\ 112 X86_BR_JMP |\
113 X86_BR_IRQ |\ 113 X86_BR_IRQ |\
114 X86_BR_ABORT |\ 114 X86_BR_ABORT |\
115 X86_BR_IND_CALL) 115 X86_BR_IND_CALL |\
116 X86_BR_ZERO_CALL)
116 117
117#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) 118#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
118 119
119#define X86_BR_ANY_CALL \ 120#define X86_BR_ANY_CALL \
120 (X86_BR_CALL |\ 121 (X86_BR_CALL |\
121 X86_BR_IND_CALL |\ 122 X86_BR_IND_CALL |\
123 X86_BR_ZERO_CALL |\
122 X86_BR_SYSCALL |\ 124 X86_BR_SYSCALL |\
123 X86_BR_IRQ |\ 125 X86_BR_IRQ |\
124 X86_BR_INT) 126 X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
130 * otherwise it becomes near impossible to get a reliable stack. 132 * otherwise it becomes near impossible to get a reliable stack.
131 */ 133 */
132 134
133static void __intel_pmu_lbr_enable(void) 135static void __intel_pmu_lbr_enable(bool pmi)
134{ 136{
135 u64 debugctl;
136 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 137 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
138 u64 debugctl, lbr_select = 0, orig_debugctl;
137 139
138 if (cpuc->lbr_sel) 140 /*
139 wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); 141 * No need to reprogram LBR_SELECT in a PMI, as it
142 * did not change.
143 */
144 if (cpuc->lbr_sel && !pmi) {
145 lbr_select = cpuc->lbr_sel->config;
146 wrmsrl(MSR_LBR_SELECT, lbr_select);
147 }
140 148
141 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 149 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
142 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 150 orig_debugctl = debugctl;
143 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 151 debugctl |= DEBUGCTLMSR_LBR;
152 /*
153 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
154 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
155 * may cause superfluous increase/decrease of LBR_TOS.
156 */
157 if (!(lbr_select & LBR_CALL_STACK))
158 debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
159 if (orig_debugctl != debugctl)
160 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
144} 161}
145 162
146static void __intel_pmu_lbr_disable(void) 163static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
181 intel_pmu_lbr_reset_64(); 198 intel_pmu_lbr_reset_64();
182} 199}
183 200
201/*
202 * TOS = most recently recorded branch
203 */
204static inline u64 intel_pmu_lbr_tos(void)
205{
206 u64 tos;
207
208 rdmsrl(x86_pmu.lbr_tos, tos);
209 return tos;
210}
211
212enum {
213 LBR_NONE,
214 LBR_VALID,
215};
216
217static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
218{
219 int i;
220 unsigned lbr_idx, mask;
221 u64 tos;
222
223 if (task_ctx->lbr_callstack_users == 0 ||
224 task_ctx->lbr_stack_state == LBR_NONE) {
225 intel_pmu_lbr_reset();
226 return;
227 }
228
229 mask = x86_pmu.lbr_nr - 1;
230 tos = intel_pmu_lbr_tos();
231 for (i = 0; i < x86_pmu.lbr_nr; i++) {
232 lbr_idx = (tos - i) & mask;
233 wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
234 wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
235 }
236 task_ctx->lbr_stack_state = LBR_NONE;
237}
238
239static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
240{
241 int i;
242 unsigned lbr_idx, mask;
243 u64 tos;
244
245 if (task_ctx->lbr_callstack_users == 0) {
246 task_ctx->lbr_stack_state = LBR_NONE;
247 return;
248 }
249
250 mask = x86_pmu.lbr_nr - 1;
251 tos = intel_pmu_lbr_tos();
252 for (i = 0; i < x86_pmu.lbr_nr; i++) {
253 lbr_idx = (tos - i) & mask;
254 rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
255 rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
256 }
257 task_ctx->lbr_stack_state = LBR_VALID;
258}
259
260void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
261{
262 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
263 struct x86_perf_task_context *task_ctx;
264
265 if (!x86_pmu.lbr_nr)
266 return;
267
268 /*
269 * If LBR callstack feature is enabled and the stack was saved when
270 * the task was scheduled out, restore the stack. Otherwise flush
271 * the LBR stack.
272 */
273 task_ctx = ctx ? ctx->task_ctx_data : NULL;
274 if (task_ctx) {
275 if (sched_in) {
276 __intel_pmu_lbr_restore(task_ctx);
277 cpuc->lbr_context = ctx;
278 } else {
279 __intel_pmu_lbr_save(task_ctx);
280 }
281 return;
282 }
283
284 /*
285 * When sampling the branch stack in system-wide mode, it may be
286 * necessary to flush the stack on context switch. This happens
287 * when the branch stack does not tag its entries with the pid
288 * of the current task. Otherwise it becomes impossible to
289 * associate a branch entry with a task. This ambiguity is more
290 * likely to appear when the branch stack supports priv level
291 * filtering and the user sets it to monitor only at the user
292 * level (which could be a useful measurement in system-wide
293 * mode). In that case, the risk is high of having a branch
294 * stack with branches from multiple tasks.
295 */
296 if (sched_in) {
297 intel_pmu_lbr_reset();
298 cpuc->lbr_context = ctx;
299 }
300}
301
302static inline bool branch_user_callstack(unsigned br_sel)
303{
304 return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
305}
306
184void intel_pmu_lbr_enable(struct perf_event *event) 307void intel_pmu_lbr_enable(struct perf_event *event)
185{ 308{
186 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 309 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
310 struct x86_perf_task_context *task_ctx;
187 311
188 if (!x86_pmu.lbr_nr) 312 if (!x86_pmu.lbr_nr)
189 return; 313 return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
198 } 322 }
199 cpuc->br_sel = event->hw.branch_reg.reg; 323 cpuc->br_sel = event->hw.branch_reg.reg;
200 324
325 if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
326 event->ctx->task_ctx_data) {
327 task_ctx = event->ctx->task_ctx_data;
328 task_ctx->lbr_callstack_users++;
329 }
330
201 cpuc->lbr_users++; 331 cpuc->lbr_users++;
332 perf_sched_cb_inc(event->ctx->pmu);
202} 333}
203 334
204void intel_pmu_lbr_disable(struct perf_event *event) 335void intel_pmu_lbr_disable(struct perf_event *event)
205{ 336{
206 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 337 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
338 struct x86_perf_task_context *task_ctx;
207 339
208 if (!x86_pmu.lbr_nr) 340 if (!x86_pmu.lbr_nr)
209 return; 341 return;
210 342
343 if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
344 event->ctx->task_ctx_data) {
345 task_ctx = event->ctx->task_ctx_data;
346 task_ctx->lbr_callstack_users--;
347 }
348
211 cpuc->lbr_users--; 349 cpuc->lbr_users--;
212 WARN_ON_ONCE(cpuc->lbr_users < 0); 350 WARN_ON_ONCE(cpuc->lbr_users < 0);
351 perf_sched_cb_dec(event->ctx->pmu);
213 352
214 if (cpuc->enabled && !cpuc->lbr_users) { 353 if (cpuc->enabled && !cpuc->lbr_users) {
215 __intel_pmu_lbr_disable(); 354 __intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
218 } 357 }
219} 358}
220 359
221void intel_pmu_lbr_enable_all(void) 360void intel_pmu_lbr_enable_all(bool pmi)
222{ 361{
223 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 362 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
224 363
225 if (cpuc->lbr_users) 364 if (cpuc->lbr_users)
226 __intel_pmu_lbr_enable(); 365 __intel_pmu_lbr_enable(pmi);
227} 366}
228 367
229void intel_pmu_lbr_disable_all(void) 368void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
234 __intel_pmu_lbr_disable(); 373 __intel_pmu_lbr_disable();
235} 374}
236 375
237/*
238 * TOS = most recently recorded branch
239 */
240static inline u64 intel_pmu_lbr_tos(void)
241{
242 u64 tos;
243
244 rdmsrl(x86_pmu.lbr_tos, tos);
245
246 return tos;
247}
248
249static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) 376static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
250{ 377{
251 unsigned long mask = x86_pmu.lbr_nr - 1; 378 unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
350 * - in case there is no HW filter 477 * - in case there is no HW filter
351 * - in case the HW filter has errata or limitations 478 * - in case the HW filter has errata or limitations
352 */ 479 */
353static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) 480static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
354{ 481{
355 u64 br_type = event->attr.branch_sample_type; 482 u64 br_type = event->attr.branch_sample_type;
356 int mask = 0; 483 int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
387 if (br_type & PERF_SAMPLE_BRANCH_COND) 514 if (br_type & PERF_SAMPLE_BRANCH_COND)
388 mask |= X86_BR_JCC; 515 mask |= X86_BR_JCC;
389 516
517 if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
518 if (!x86_pmu_has_lbr_callstack())
519 return -EOPNOTSUPP;
520 if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
521 return -EINVAL;
522 mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
523 X86_BR_CALL_STACK;
524 }
525
390 /* 526 /*
391 * stash actual user request into reg, it may 527 * stash actual user request into reg, it may
392 * be used by fixup code for some CPU 528 * be used by fixup code for some CPU
393 */ 529 */
394 event->hw.branch_reg.reg = mask; 530 event->hw.branch_reg.reg = mask;
531 return 0;
395} 532}
396 533
397/* 534/*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
403{ 540{
404 struct hw_perf_event_extra *reg; 541 struct hw_perf_event_extra *reg;
405 u64 br_type = event->attr.branch_sample_type; 542 u64 br_type = event->attr.branch_sample_type;
406 u64 mask = 0, m; 543 u64 mask = 0, v;
407 u64 v; 544 int i;
408 545
409 for_each_branch_sample_type(m) { 546 for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
410 if (!(br_type & m)) 547 if (!(br_type & (1ULL << i)))
411 continue; 548 continue;
412 549
413 v = x86_pmu.lbr_sel_map[m]; 550 v = x86_pmu.lbr_sel_map[i];
414 if (v == LBR_NOT_SUPP) 551 if (v == LBR_NOT_SUPP)
415 return -EOPNOTSUPP; 552 return -EOPNOTSUPP;
416 553
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
420 reg = &event->hw.branch_reg; 557 reg = &event->hw.branch_reg;
421 reg->idx = EXTRA_REG_LBR; 558 reg->idx = EXTRA_REG_LBR;
422 559
423 /* LBR_SELECT operates in suppress mode so invert mask */ 560 /*
424 reg->config = ~mask & x86_pmu.lbr_sel_mask; 561 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
562 * in suppress mode. So LBR_SELECT should be set to
563 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
564 */
565 reg->config = mask ^ x86_pmu.lbr_sel_mask;
425 566
426 return 0; 567 return 0;
427} 568}
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
439 /* 580 /*
440 * setup SW LBR filter 581 * setup SW LBR filter
441 */ 582 */
442 intel_pmu_setup_sw_lbr_filter(event); 583 ret = intel_pmu_setup_sw_lbr_filter(event);
584 if (ret)
585 return ret;
443 586
444 /* 587 /*
445 * setup HW LBR filter, if any 588 * setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
568 ret = X86_BR_INT; 711 ret = X86_BR_INT;
569 break; 712 break;
570 case 0xe8: /* call near rel */ 713 case 0xe8: /* call near rel */
714 insn_get_immediate(&insn);
715 if (insn.immediate1.value == 0) {
716 /* zero length call */
717 ret = X86_BR_ZERO_CALL;
718 break;
719 }
571 case 0x9a: /* call far absolute */ 720 case 0x9a: /* call far absolute */
572 ret = X86_BR_CALL; 721 ret = X86_BR_CALL;
573 break; 722 break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
678/* 827/*
679 * Map interface branch filters onto LBR filters 828 * Map interface branch filters onto LBR filters
680 */ 829 */
681static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { 830static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
682 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, 831 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
683 [PERF_SAMPLE_BRANCH_USER] = LBR_USER, 832 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
684 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, 833 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
685 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, 834 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
686 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP 835 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
687 | LBR_IND_JMP | LBR_FAR, 836 | LBR_IND_JMP | LBR_FAR,
688 /* 837 /*
689 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches 838 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
690 */ 839 */
691 [PERF_SAMPLE_BRANCH_ANY_CALL] = 840 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
692 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, 841 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
693 /* 842 /*
694 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL 843 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
695 */ 844 */
696 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, 845 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
697 [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, 846 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
698}; 847};
699 848
700static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { 849static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
701 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, 850 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
702 [PERF_SAMPLE_BRANCH_USER] = LBR_USER, 851 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
703 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, 852 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
704 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, 853 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
705 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, 854 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
706 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL 855 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
707 | LBR_FAR, 856 | LBR_FAR,
708 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, 857 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
709 [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, 858 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
859};
860
861static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
862 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
863 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
864 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
865 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
866 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
867 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
868 | LBR_FAR,
869 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
870 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
871 [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
872 | LBR_RETURN | LBR_CALL_STACK,
710}; 873};
711 874
712/* core */ 875/* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
765 pr_cont("16-deep LBR, "); 928 pr_cont("16-deep LBR, ");
766} 929}
767 930
931/* haswell */
932void intel_pmu_lbr_init_hsw(void)
933{
934 x86_pmu.lbr_nr = 16;
935 x86_pmu.lbr_tos = MSR_LBR_TOS;
936 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
937 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
938
939 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
940 x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
941
942 pr_cont("16-deep LBR, ");
943}
944
768/* atom */ 945/* atom */
769void __init intel_pmu_lbr_init_atom(void) 946void __init intel_pmu_lbr_init_atom(void)
770{ 947{
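The hsw_lbr_sel_map entry for PERF_SAMPLE_BRANCH_CALL_STACK added above, together with the software-filter change that only accepts it combined with user/kernel privilege bits, is what a user-space caller would exercise along the lines of the hedged sketch below. The cycles event and sampling period are arbitrary assumptions, and the PERF_SAMPLE_BRANCH_CALL_STACK constant assumes a uapi header that already carries the call-stack branch sample type introduced alongside these changes.

/* Hypothetical caller requesting LBR call-stack mode for user-level
 * branches on a sampling cycles event. */
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int open_lbr_callstack_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
				  PERF_SAMPLE_BRANCH_USER;
	attr.exclude_kernel = 1;

	/* Profile the current task on any CPU. */
	return perf_event_open(&attr, 0, -1, -1, 0);
}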
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..ffe666c2c6b5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1100 @@
1/*
2 * Intel(R) Processor Trace PMU driver for perf
3 * Copyright (c) 2013-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15 * Programming Reference:
16 * http://software.intel.com/en-us/intel-isa-extensions
17 */
18
19#undef DEBUG
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/types.h>
24#include <linux/slab.h>
25#include <linux/device.h>
26
27#include <asm/perf_event.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30
31#include "perf_event.h"
32#include "intel_pt.h"
33
34static DEFINE_PER_CPU(struct pt, pt_ctx);
35
36static struct pt_pmu pt_pmu;
37
38enum cpuid_regs {
39 CR_EAX = 0,
40 CR_ECX,
41 CR_EDX,
42 CR_EBX
43};
44
45/*
46 * Capabilities of Intel PT hardware, such as number of address bits or
47 * supported output schemes, are cached and exported to userspace as "caps"
48 * attribute group of pt pmu device
49 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
50 * relevant bits together with intel_pt traces.
51 *
52 * These are necessary for both trace decoding (payloads_lip, contains address
53 * width encoded in IP-related packets), and event configuration (bitmasks with
54 * permitted values for certain bit fields).
55 */
56#define PT_CAP(_n, _l, _r, _m) \
57 [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
58 .reg = _r, .mask = _m }
59
60static struct pt_cap_desc {
61 const char *name;
62 u32 leaf;
63 u8 reg;
64 u32 mask;
65} pt_caps[] = {
66 PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
67 PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
68 PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
69 PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
70 PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
71};
72
73static u32 pt_cap_get(enum pt_capabilities cap)
74{
75 struct pt_cap_desc *cd = &pt_caps[cap];
76 u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
77 unsigned int shift = __ffs(cd->mask);
78
79 return (c & cd->mask) >> shift;
80}
81
82static ssize_t pt_cap_show(struct device *cdev,
83 struct device_attribute *attr,
84 char *buf)
85{
86 struct dev_ext_attribute *ea =
87 container_of(attr, struct dev_ext_attribute, attr);
88 enum pt_capabilities cap = (long)ea->var;
89
90 return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
91}
92
93static struct attribute_group pt_cap_group = {
94 .name = "caps",
95};
96
97PMU_FORMAT_ATTR(tsc, "config:10" );
98PMU_FORMAT_ATTR(noretcomp, "config:11" );
99
100static struct attribute *pt_formats_attr[] = {
101 &format_attr_tsc.attr,
102 &format_attr_noretcomp.attr,
103 NULL,
104};
105
106static struct attribute_group pt_format_group = {
107 .name = "format",
108 .attrs = pt_formats_attr,
109};
110
111static const struct attribute_group *pt_attr_groups[] = {
112 &pt_cap_group,
113 &pt_format_group,
114 NULL,
115};
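The caps group registered here is filled in by pt_pmu_hw_init() below and ends up under the sysfs path named in the comment above. A hedged sketch of a decoder-side helper that reads one capability follows; the helper name is hypothetical.

/* Hypothetical helper: read one cached PT capability exported via the
 * caps attribute group (pt_cap_show() prints the value with "%x"). */
#include <stdio.h>

int pt_read_cap(const char *name, unsigned int *val)
{
	char path[128];
	FILE *f;
	int ok;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/intel_pt/caps/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	ok = (fscanf(f, "%x", val) == 1) ? 0 : -1;
	fclose(f);
	return ok;
}

/* e.g. pt_read_cap("topa_multiple_entries", &multi) */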
116
117static int __init pt_pmu_hw_init(void)
118{
119 struct dev_ext_attribute *de_attrs;
120 struct attribute **attrs;
121 size_t size;
122 int ret;
123 long i;
124
125 attrs = NULL;
126 ret = -ENODEV;
127 if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
128 goto fail;
129
130 for (i = 0; i < PT_CPUID_LEAVES; i++) {
131 cpuid_count(20, i,
132 &pt_pmu.caps[CR_EAX + i*4],
133 &pt_pmu.caps[CR_EBX + i*4],
134 &pt_pmu.caps[CR_ECX + i*4],
135 &pt_pmu.caps[CR_EDX + i*4]);
136 }
137
138 ret = -ENOMEM;
139 size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
140 attrs = kzalloc(size, GFP_KERNEL);
141 if (!attrs)
142 goto fail;
143
144 size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
145 de_attrs = kzalloc(size, GFP_KERNEL);
146 if (!de_attrs)
147 goto fail;
148
149 for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
150 struct dev_ext_attribute *de_attr = de_attrs + i;
151
152 de_attr->attr.attr.name = pt_caps[i].name;
153
154 sysfs_attr_init(&de_attr->attr.attr);
155
156 de_attr->attr.attr.mode = S_IRUGO;
157 de_attr->attr.show = pt_cap_show;
158 de_attr->var = (void *)i;
159
160 attrs[i] = &de_attr->attr.attr;
161 }
162
163 pt_cap_group.attrs = attrs;
164
165 return 0;
166
167fail:
168 kfree(attrs);
169
170 return ret;
171}
172
173#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
174
175static bool pt_event_valid(struct perf_event *event)
176{
177 u64 config = event->attr.config;
178
179 if ((config & PT_CONFIG_MASK) != config)
180 return false;
181
182 return true;
183}
184
185/*
186 * PT configuration helpers
187 * These all are cpu affine and operate on a local PT
188 */
189
190static bool pt_is_running(void)
191{
192 u64 ctl;
193
194 rdmsrl(MSR_IA32_RTIT_CTL, ctl);
195
196 return !!(ctl & RTIT_CTL_TRACEEN);
197}
198
199static void pt_config(struct perf_event *event)
200{
201 u64 reg;
202
203 reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
204
205 if (!event->attr.exclude_kernel)
206 reg |= RTIT_CTL_OS;
207 if (!event->attr.exclude_user)
208 reg |= RTIT_CTL_USR;
209
210 reg |= (event->attr.config & PT_CONFIG_MASK);
211
212 wrmsrl(MSR_IA32_RTIT_CTL, reg);
213}
214
215static void pt_config_start(bool start)
216{
217 u64 ctl;
218
219 rdmsrl(MSR_IA32_RTIT_CTL, ctl);
220 if (start)
221 ctl |= RTIT_CTL_TRACEEN;
222 else
223 ctl &= ~RTIT_CTL_TRACEEN;
224 wrmsrl(MSR_IA32_RTIT_CTL, ctl);
225
226 /*
227 * A wrmsr that disables trace generation serializes other PT
228 * registers and causes all data packets to be written to memory,
229 * but a fence is required for the data to become globally visible.
230 *
231 * The WMB below, separating the data store and aux_head store, matches
232 * the consumer's RMB that separates aux_head load and data load.
233 */
234 if (!start)
235 wmb();
236}
237
238static void pt_config_buffer(void *buf, unsigned int topa_idx,
239 unsigned int output_off)
240{
241 u64 reg;
242
243 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
244
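	/*
	 * Output mask pointer layout, as programmed below and decoded in
	 * pt_read_offset(): bits 6:0 are set to 0x7f, bits 31:7 hold the
	 * index of the current ToPA entry and bits 63:32 the offset into
	 * the current output region.
	 */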
245 reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
246
247 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
248}
249
250/*
251 * Keep ToPA table-related metadata on the same page as the actual table,
252 * taking up a few words from the top
253 */
254
255#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
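/*
 * Worked example (assuming 4KiB pages and 8-byte ToPA entries):
 * (4096 - 40) / 8 - 1 = 506 table entries, i.e. 4048 bytes of table,
 * leaving the remaining 48 bytes of the page for the struct topa
 * metadata declared below.
 */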
256
257/**
258 * struct topa - page-sized ToPA table with metadata at the top
259 * @table: actual ToPA table entries, as understood by PT hardware
260 * @list: linkage to struct pt_buffer's list of tables
261 * @phys: physical address of this page
262 * @offset: offset of the first entry in this table in the buffer
263 * @size: total size of all entries in this table
264 * @last: index of the last initialized entry in this table
265 */
266struct topa {
267 struct topa_entry table[TENTS_PER_PAGE];
268 struct list_head list;
269 u64 phys;
270 u64 offset;
271 size_t size;
272 int last;
273};
274
275/* make -1 stand for the last table entry */
276#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
277
278/**
279 * topa_alloc() - allocate page-sized ToPA table
280 * @cpu: CPU on which to allocate.
281 * @gfp: Allocation flags.
282 *
283 * Return: On success, return the pointer to ToPA table page.
284 */
285static struct topa *topa_alloc(int cpu, gfp_t gfp)
286{
287 int node = cpu_to_node(cpu);
288 struct topa *topa;
289 struct page *p;
290
291 p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
292 if (!p)
293 return NULL;
294
295 topa = page_address(p);
296 topa->last = 0;
297 topa->phys = page_to_phys(p);
298
299 /*
300 * In case of single-entry ToPA, always put the self-referencing END
301 * link as the 2nd entry in the table
302 */
303 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
304 TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
305 TOPA_ENTRY(topa, 1)->end = 1;
306 }
307
308 return topa;
309}
310
311/**
312 * topa_free() - free a page-sized ToPA table
313 * @topa: Table to deallocate.
314 */
315static void topa_free(struct topa *topa)
316{
317 free_page((unsigned long)topa);
318}
319
320/**
321 * topa_insert_table() - insert a ToPA table into a buffer
322 * @buf: PT buffer that's being extended.
323 * @topa: New topa table to be inserted.
324 *
325 * If it's the first table in this buffer, set up buffer's pointers
326 * accordingly; otherwise, add an END=1 link entry pointing to @topa in the
327 * current "last" table and make @topa the new last table.
328 */
329static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
330{
331 struct topa *last = buf->last;
332
333 list_add_tail(&topa->list, &buf->tables);
334
335 if (!buf->first) {
336 buf->first = buf->last = buf->cur = topa;
337 return;
338 }
339
340 topa->offset = last->offset + last->size;
341 buf->last = topa;
342
343 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
344 return;
345
346 BUG_ON(last->last != TENTS_PER_PAGE - 1);
347
348 TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
349 TOPA_ENTRY(last, -1)->end = 1;
350}
351
352/**
353 * topa_table_full() - check if a ToPA table is filled up
354 * @topa: ToPA table.
355 */
356static bool topa_table_full(struct topa *topa)
357{
358 /* single-entry ToPA is a special case */
359 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
360 return !!topa->last;
361
362 return topa->last == TENTS_PER_PAGE - 1;
363}
364
365/**
366 * topa_insert_pages() - create a list of ToPA tables
367 * @buf: PT buffer being initialized.
368 * @gfp: Allocation flags.
369 *
370 * This initializes a list of ToPA tables with entries from
371 * the data_pages provided by rb_alloc_aux().
372 *
373 * Return: 0 on success or error code.
374 */
375static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
376{
377 struct topa *topa = buf->last;
378 int order = 0;
379 struct page *p;
380
381 p = virt_to_page(buf->data_pages[buf->nr_pages]);
382 if (PagePrivate(p))
383 order = page_private(p);
384
385 if (topa_table_full(topa)) {
386 topa = topa_alloc(buf->cpu, gfp);
387 if (!topa)
388 return -ENOMEM;
389
390 topa_insert_table(buf, topa);
391 }
392
393 TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
394 TOPA_ENTRY(topa, -1)->size = order;
395 if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
396 TOPA_ENTRY(topa, -1)->intr = 1;
397 TOPA_ENTRY(topa, -1)->stop = 1;
398 }
399
400 topa->last++;
401 topa->size += sizes(order);
402
403 buf->nr_pages += 1ul << order;
404
405 return 0;
406}
407
408/**
409 * pt_topa_dump() - print ToPA tables and their entries
410 * @buf: PT buffer.
411 */
412static void pt_topa_dump(struct pt_buffer *buf)
413{
414 struct topa *topa;
415
416 list_for_each_entry(topa, &buf->tables, list) {
417 int i;
418
419 pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
420 topa->phys, topa->offset, topa->size);
421 for (i = 0; i < TENTS_PER_PAGE; i++) {
422 pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
423 &topa->table[i],
424 (unsigned long)topa->table[i].base << TOPA_SHIFT,
425 sizes(topa->table[i].size),
426 topa->table[i].end ? 'E' : ' ',
427 topa->table[i].intr ? 'I' : ' ',
428 topa->table[i].stop ? 'S' : ' ',
429 *(u64 *)&topa->table[i]);
430 if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
431 topa->table[i].stop) ||
432 topa->table[i].end)
433 break;
434 }
435 }
436}
437
438/**
439 * pt_buffer_advance() - advance to the next output region
440 * @buf: PT buffer.
441 *
442 * Advance the current pointers in the buffer to the next ToPA entry.
443 */
444static void pt_buffer_advance(struct pt_buffer *buf)
445{
446 buf->output_off = 0;
447 buf->cur_idx++;
448
449 if (buf->cur_idx == buf->cur->last) {
450 if (buf->cur == buf->last)
451 buf->cur = buf->first;
452 else
453 buf->cur = list_entry(buf->cur->list.next, struct topa,
454 list);
455 buf->cur_idx = 0;
456 }
457}
458
459/**
460 * pt_update_head() - calculate current offsets and sizes
461 * @pt: Per-cpu pt context.
462 *
463 * Update buffer's current write pointer position and data size.
464 */
465static void pt_update_head(struct pt *pt)
466{
467 struct pt_buffer *buf = perf_get_aux(&pt->handle);
468 u64 topa_idx, base, old;
469
470 /* offset of the first region in this table from the beginning of buf */
471 base = buf->cur->offset + buf->output_off;
472
473 /* offset of the current output region within this table */
474 for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
475 base += sizes(buf->cur->table[topa_idx].size);
476
477 if (buf->snapshot) {
478 local_set(&buf->data_size, base);
479 } else {
480 old = (local64_xchg(&buf->head, base) &
481 ((buf->nr_pages << PAGE_SHIFT) - 1));
482 if (base < old)
483 base += buf->nr_pages << PAGE_SHIFT;
484
485 local_add(base - old, &buf->data_size);
486 }
487}
488
489/**
490 * pt_buffer_region() - obtain current output region's address
491 * @buf: PT buffer.
492 */
493static void *pt_buffer_region(struct pt_buffer *buf)
494{
495 return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
496}
497
498/**
499 * pt_buffer_region_size() - obtain current output region's size
500 * @buf: PT buffer.
501 */
502static size_t pt_buffer_region_size(struct pt_buffer *buf)
503{
504 return sizes(buf->cur->table[buf->cur_idx].size);
505}
506
507/**
508 * pt_handle_status() - take care of possible status conditions
509 * @pt: Per-cpu pt context.
510 */
511static void pt_handle_status(struct pt *pt)
512{
513 struct pt_buffer *buf = perf_get_aux(&pt->handle);
514 int advance = 0;
515 u64 status;
516
517 rdmsrl(MSR_IA32_RTIT_STATUS, status);
518
519 if (status & RTIT_STATUS_ERROR) {
520 pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
521 pt_topa_dump(buf);
522 status &= ~RTIT_STATUS_ERROR;
523 }
524
525 if (status & RTIT_STATUS_STOPPED) {
526 status &= ~RTIT_STATUS_STOPPED;
527
528 /*
529 * On systems that only do single-entry ToPA, hitting STOP
530 * means we are already losing data; need to let the decoder
531 * know.
532 */
533 if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
534 buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
535 local_inc(&buf->lost);
536 advance++;
537 }
538 }
539
540 /*
541 * Also on single-entry ToPA implementations, the interrupt will come
542 * before the output reaches its output region's boundary.
543 */
544 if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
545 pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
546 void *head = pt_buffer_region(buf);
547
548 /* everything within this margin needs to be zeroed out */
549 memset(head + buf->output_off, 0,
550 pt_buffer_region_size(buf) -
551 buf->output_off);
552 advance++;
553 }
554
555 if (advance)
556 pt_buffer_advance(buf);
557
558 wrmsrl(MSR_IA32_RTIT_STATUS, status);
559}
560
561/**
562 * pt_read_offset() - translate registers into buffer pointers
563 * @buf: PT buffer.
564 *
565 * Set buffer's output pointers from MSR values.
566 */
567static void pt_read_offset(struct pt_buffer *buf)
568{
569 u64 offset, base_topa;
570
571 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
572 buf->cur = phys_to_virt(base_topa);
573
574 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
575 /* offset within current output region */
576 buf->output_off = offset >> 32;
577 /* index of current output region within this table */
578 buf->cur_idx = (offset & 0xffffff80) >> 7;
579}
580
581/**
582 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
583 * @buf: PT buffer.
584 * @pg: Page offset in the buffer.
585 *
586 * When advancing to the next output region (ToPA entry), given a page offset
587 * into the buffer, we need to find the offset of the first page in the next
588 * region.
589 */
590static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
591{
592 struct topa_entry *te = buf->topa_index[pg];
593
594 /* one region */
595 if (buf->first == buf->last && buf->first->last == 1)
596 return pg;
597
598 do {
599 pg++;
600 pg &= buf->nr_pages - 1;
601 } while (buf->topa_index[pg] == te);
602
603 return pg;
604}
605
606/**
607 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
608 * @buf: PT buffer.
609 * @handle: Current output handle.
610 *
611 * Place INT and STOP marks to prevent overwriting old data that the consumer
612 * hasn't yet collected.
613 */
614static int pt_buffer_reset_markers(struct pt_buffer *buf,
615 struct perf_output_handle *handle)
616
617{
618 unsigned long idx, npages, end;
619
620 if (buf->snapshot)
621 return 0;
622
623 /* can't stop in the middle of an output region */
624 if (buf->output_off + handle->size + 1 <
625 sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
626 return -EINVAL;
627
628
629 /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
630 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
631 return 0;
632
633 /* clear STOP and INT from current entry */
634 buf->topa_index[buf->stop_pos]->stop = 0;
635 buf->topa_index[buf->intr_pos]->intr = 0;
636
637 if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
638 npages = (handle->size + 1) >> PAGE_SHIFT;
639 end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
640 /*if (end > handle->wakeup >> PAGE_SHIFT)
641 end = handle->wakeup >> PAGE_SHIFT;*/
642 idx = end & (buf->nr_pages - 1);
643 buf->stop_pos = idx;
644 idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
645 idx &= buf->nr_pages - 1;
646 buf->intr_pos = idx;
647 }
648
649 buf->topa_index[buf->stop_pos]->stop = 1;
650 buf->topa_index[buf->intr_pos]->intr = 1;
651
652 return 0;
653}
654
655/**
656 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
657 * @buf: PT buffer.
658 *
659 * topa_index[] references output regions indexed by offset into the
660 * buffer for purposes of quick reverse lookup.
661 */
662static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
663{
664 struct topa *cur = buf->first, *prev = buf->last;
665 struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
666 *te_prev = TOPA_ENTRY(prev, prev->last - 1);
667 int pg = 0, idx = 0, ntopa = 0;
668
669 while (pg < buf->nr_pages) {
670 int tidx;
671
672 /* pages within one topa entry */
673 for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
674 buf->topa_index[pg] = te_prev;
675
676 te_prev = te_cur;
677
678 if (idx == cur->last - 1) {
679 /* advance to next topa table */
680 idx = 0;
681 cur = list_entry(cur->list.next, struct topa, list);
682 ntopa++;
683 } else
684 idx++;
685 te_cur = TOPA_ENTRY(cur, idx);
686 }
687
688}
689
690/**
691 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
692 * @buf: PT buffer.
693 * @head: Write pointer (aux_head) from AUX buffer.
694 *
695 * Find the ToPA table and entry corresponding to given @head and set buffer's
696 * "current" pointers accordingly.
697 */
698static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
699{
700 int pg;
701
702 if (buf->snapshot)
703 head &= (buf->nr_pages << PAGE_SHIFT) - 1;
704
705 pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
706 pg = pt_topa_next_entry(buf, pg);
707
708 buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
709 buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
710 (unsigned long)buf->cur) / sizeof(struct topa_entry);
711 buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
712
713 local64_set(&buf->head, head);
714 local_set(&buf->data_size, 0);
715}
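The pointer arithmetic above works because each ToPA table occupies a single page: rounding a topa_entry pointer down to a page boundary recovers its table, and the remaining byte offset divided by the entry size gives the index. A user-space sketch of the same idea, assuming 4 KiB pages and a stand-in entry type:

#include <stdint.h>
#include <stddef.h>

#define MY_PAGE_SIZE 4096UL			/* assumed page size */

struct my_entry { uint64_t word; };		/* stand-in for topa_entry */

static void split_entry(const struct my_entry *te,
			uintptr_t *table_base, size_t *idx)
{
	*table_base = (uintptr_t)te & ~(MY_PAGE_SIZE - 1);
	*idx = ((uintptr_t)te - *table_base) / sizeof(*te);
}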
716
717/**
718 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
719 * @buf: PT buffer.
720 */
721static void pt_buffer_fini_topa(struct pt_buffer *buf)
722{
723 struct topa *topa, *iter;
724
725 list_for_each_entry_safe(topa, iter, &buf->tables, list) {
726 /*
727 * right now, this is in free_aux() path only, so
728 * no need to unlink this table from the list
729 */
730 topa_free(topa);
731 }
732}
733
734/**
735 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
736 * @buf: PT buffer.
737 * @nr_pages: Number of output data pages to cover with ToPA entries.
738 * @gfp: Allocation flags.
739 */
740static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
741 gfp_t gfp)
742{
743 struct topa *topa;
744 int err;
745
746 topa = topa_alloc(buf->cpu, gfp);
747 if (!topa)
748 return -ENOMEM;
749
750 topa_insert_table(buf, topa);
751
752 while (buf->nr_pages < nr_pages) {
753 err = topa_insert_pages(buf, gfp);
754 if (err) {
755 pt_buffer_fini_topa(buf);
756 return -ENOMEM;
757 }
758 }
759
760 pt_buffer_setup_topa_index(buf);
761
762 /* link last table to the first one, unless we're double buffering */
763 if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
764 TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
765 TOPA_ENTRY(buf->last, -1)->end = 1;
766 }
767
768 pt_topa_dump(buf);
769 return 0;
770}
771
772/**
773 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
774 * @cpu: CPU on which to allocate; -1 means the current CPU.
775 * @pages: Array of pointers to buffer pages passed from perf core.
776 * @nr_pages: Number of pages in the buffer.
777 * @snapshot: If this is a snapshot/overwrite counter.
778 *
779 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
780 * bookkeeping for an AUX buffer.
781 *
782 * Return: Our private PT buffer structure.
783 */
784static void *
785pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
786{
787 struct pt_buffer *buf;
788 int node, ret;
789
790 if (!nr_pages)
791 return NULL;
792
793 if (cpu == -1)
794 cpu = raw_smp_processor_id();
795 node = cpu_to_node(cpu);
796
797 buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
798 GFP_KERNEL, node);
799 if (!buf)
800 return NULL;
801
802 buf->cpu = cpu;
803 buf->snapshot = snapshot;
804 buf->data_pages = pages;
805
806 INIT_LIST_HEAD(&buf->tables);
807
808 ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
809 if (ret) {
810 kfree(buf);
811 return NULL;
812 }
813
814 return buf;
815}
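The kzalloc_node() call above uses the trailing-array idiom: topa_index[] is sized at allocation time by asking for offsetof(struct pt_buffer, topa_index[nr_pages]) bytes. A generic user-space sketch of the same idiom (portable sizeof spelling, hypothetical types):

#include <stdlib.h>

struct table {
	int nr;
	void *slot[];			/* flexible array member */
};

static struct table *table_alloc(int nr)
{
	/* header plus nr trailing slots, zero-initialized */
	struct table *t = calloc(1, sizeof(*t) + nr * sizeof(t->slot[0]));

	if (t)
		t->nr = nr;
	return t;
}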
816
817/**
818 * pt_buffer_free_aux() - perf AUX deallocation path callback
819 * @data: PT buffer.
820 */
821static void pt_buffer_free_aux(void *data)
822{
823 struct pt_buffer *buf = data;
824
825 pt_buffer_fini_topa(buf);
826 kfree(buf);
827}
828
829/**
830 * pt_buffer_is_full() - check if the buffer is full
831 * @buf: PT buffer.
832 * @pt: Per-cpu pt handle.
833 *
834 * If the user hasn't read data from the output region that aux_head
835 * points to, the buffer is considered full: the user needs to read at
836 * least this region and update aux_tail to point past it.
837 */
838static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
839{
840 if (buf->snapshot)
841 return false;
842
843 if (local_read(&buf->data_size) >= pt->handle.size)
844 return true;
845
846 return false;
847}
848
849/**
850 * intel_pt_interrupt() - PT PMI handler
851 */
852void intel_pt_interrupt(void)
853{
854 struct pt *pt = this_cpu_ptr(&pt_ctx);
855 struct pt_buffer *buf;
856 struct perf_event *event = pt->handle.event;
857
858 /*
859 * There may be a dangling PT bit in the interrupt status register
860 * after PT has been disabled by pt_event_stop(). Make sure we don't
861 * do anything (particularly, re-enable) for this event here.
862 */
863 if (!ACCESS_ONCE(pt->handle_nmi))
864 return;
865
866 pt_config_start(false);
867
868 if (!event)
869 return;
870
871 buf = perf_get_aux(&pt->handle);
872 if (!buf)
873 return;
874
875 pt_read_offset(buf);
876
877 pt_handle_status(pt);
878
879 pt_update_head(pt);
880
881 perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
882 local_xchg(&buf->lost, 0));
883
884 if (!event->hw.state) {
885 int ret;
886
887 buf = perf_aux_output_begin(&pt->handle, event);
888 if (!buf) {
889 event->hw.state = PERF_HES_STOPPED;
890 return;
891 }
892
893 pt_buffer_reset_offsets(buf, pt->handle.head);
894 ret = pt_buffer_reset_markers(buf, &pt->handle);
895 if (ret) {
896 perf_aux_output_end(&pt->handle, 0, true);
897 return;
898 }
899
900 pt_config_buffer(buf->cur->table, buf->cur_idx,
901 buf->output_off);
902 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
903 pt_config(event);
904 }
905}
906
907/*
908 * PMU callbacks
909 */
910
911static void pt_event_start(struct perf_event *event, int mode)
912{
913 struct pt *pt = this_cpu_ptr(&pt_ctx);
914 struct pt_buffer *buf = perf_get_aux(&pt->handle);
915
916 if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
917 event->hw.state = PERF_HES_STOPPED;
918 return;
919 }
920
921 ACCESS_ONCE(pt->handle_nmi) = 1;
922 event->hw.state = 0;
923
924 pt_config_buffer(buf->cur->table, buf->cur_idx,
925 buf->output_off);
926 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
927 pt_config(event);
928}
929
930static void pt_event_stop(struct perf_event *event, int mode)
931{
932 struct pt *pt = this_cpu_ptr(&pt_ctx);
933
934 /*
935	 * Protect against the PMI racing with the wrmsr that disables
936	 * tracing; see the comment in intel_pt_interrupt().
937 */
938 ACCESS_ONCE(pt->handle_nmi) = 0;
939 pt_config_start(false);
940
941 if (event->hw.state == PERF_HES_STOPPED)
942 return;
943
944 event->hw.state = PERF_HES_STOPPED;
945
946 if (mode & PERF_EF_UPDATE) {
947 struct pt *pt = this_cpu_ptr(&pt_ctx);
948 struct pt_buffer *buf = perf_get_aux(&pt->handle);
949
950 if (!buf)
951 return;
952
953 if (WARN_ON_ONCE(pt->handle.event != event))
954 return;
955
956 pt_read_offset(buf);
957
958 pt_handle_status(pt);
959
960 pt_update_head(pt);
961 }
962}
963
964static void pt_event_del(struct perf_event *event, int mode)
965{
966 struct pt *pt = this_cpu_ptr(&pt_ctx);
967 struct pt_buffer *buf;
968
969 pt_event_stop(event, PERF_EF_UPDATE);
970
971 buf = perf_get_aux(&pt->handle);
972
973 if (buf) {
974 if (buf->snapshot)
975 pt->handle.head =
976 local_xchg(&buf->data_size,
977 buf->nr_pages << PAGE_SHIFT);
978 perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
979 local_xchg(&buf->lost, 0));
980 }
981}
982
983static int pt_event_add(struct perf_event *event, int mode)
984{
985 struct pt_buffer *buf;
986 struct pt *pt = this_cpu_ptr(&pt_ctx);
987 struct hw_perf_event *hwc = &event->hw;
988 int ret = -EBUSY;
989
990 if (pt->handle.event)
991 goto fail;
992
993 buf = perf_aux_output_begin(&pt->handle, event);
994 ret = -EINVAL;
995 if (!buf)
996 goto fail_stop;
997
998 pt_buffer_reset_offsets(buf, pt->handle.head);
999 if (!buf->snapshot) {
1000 ret = pt_buffer_reset_markers(buf, &pt->handle);
1001 if (ret)
1002 goto fail_end_stop;
1003 }
1004
1005 if (mode & PERF_EF_START) {
1006 pt_event_start(event, 0);
1007 ret = -EBUSY;
1008 if (hwc->state == PERF_HES_STOPPED)
1009 goto fail_end_stop;
1010 } else {
1011 hwc->state = PERF_HES_STOPPED;
1012 }
1013
1014 return 0;
1015
1016fail_end_stop:
1017 perf_aux_output_end(&pt->handle, 0, true);
1018fail_stop:
1019 hwc->state = PERF_HES_STOPPED;
1020fail:
1021 return ret;
1022}
1023
1024static void pt_event_read(struct perf_event *event)
1025{
1026}
1027
1028static void pt_event_destroy(struct perf_event *event)
1029{
1030 x86_del_exclusive(x86_lbr_exclusive_pt);
1031}
1032
1033static int pt_event_init(struct perf_event *event)
1034{
1035 if (event->attr.type != pt_pmu.pmu.type)
1036 return -ENOENT;
1037
1038 if (!pt_event_valid(event))
1039 return -EINVAL;
1040
1041 if (x86_add_exclusive(x86_lbr_exclusive_pt))
1042 return -EBUSY;
1043
1044 event->destroy = pt_event_destroy;
1045
1046 return 0;
1047}
1048
1049static __init int pt_init(void)
1050{
1051 int ret, cpu, prior_warn = 0;
1052
1053 BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1054 get_online_cpus();
1055 for_each_online_cpu(cpu) {
1056 u64 ctl;
1057
1058 ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1059 if (!ret && (ctl & RTIT_CTL_TRACEEN))
1060 prior_warn++;
1061 }
1062 put_online_cpus();
1063
1064 if (prior_warn) {
1065 x86_add_exclusive(x86_lbr_exclusive_pt);
1066 pr_warn("PT is enabled at boot time, doing nothing\n");
1067
1068 return -EBUSY;
1069 }
1070
1071 ret = pt_pmu_hw_init();
1072 if (ret)
1073 return ret;
1074
1075 if (!pt_cap_get(PT_CAP_topa_output)) {
1076 pr_warn("ToPA output is not supported on this CPU\n");
1077 return -ENODEV;
1078 }
1079
1080 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
1081 pt_pmu.pmu.capabilities =
1082 PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
1083
1084 pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1085 pt_pmu.pmu.attr_groups = pt_attr_groups;
1086 pt_pmu.pmu.task_ctx_nr = perf_sw_context;
1087 pt_pmu.pmu.event_init = pt_event_init;
1088 pt_pmu.pmu.add = pt_event_add;
1089 pt_pmu.pmu.del = pt_event_del;
1090 pt_pmu.pmu.start = pt_event_start;
1091 pt_pmu.pmu.stop = pt_event_stop;
1092 pt_pmu.pmu.read = pt_event_read;
1093 pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
1094 pt_pmu.pmu.free_aux = pt_buffer_free_aux;
1095 ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1096
1097 return ret;
1098}
1099
1100module_init(pt_init);
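Once perf_pmu_register() succeeds, the PMU appears as "intel_pt" under /sys/bus/event_source/devices and can be driven from user space with the perf tool; a typical invocation (tooling-dependent, requires a perf build that understands AUX/Intel PT data) is perf record -e intel_pt// -- <workload>, with perf script decoding the recorded AUX area afterwards.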
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ 63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
64 64
65#define NR_RAPL_DOMAINS 0x4
66static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
67 "pp0-core",
68 "package",
69 "dram",
70 "pp1-gpu",
71};
72
65/* Clients have PP0, PKG */ 73/* Clients have PP0, PKG */
66#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ 74#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
67 1<<RAPL_IDX_PKG_NRG_STAT|\ 75 1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \
112 120
113struct rapl_pmu { 121struct rapl_pmu {
114 spinlock_t lock; 122 spinlock_t lock;
115 int hw_unit; /* 1/2^hw_unit Joule */
116 int n_active; /* number of active events */ 123 int n_active; /* number of active events */
117 struct list_head active_list; 124 struct list_head active_list;
118 struct pmu *pmu; /* pointer to rapl_pmu_class */ 125 struct pmu *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
120 struct hrtimer hrtimer; 127 struct hrtimer hrtimer;
121}; 128};
122 129
130static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
123static struct pmu rapl_pmu_class; 131static struct pmu rapl_pmu_class;
124static cpumask_t rapl_cpu_mask; 132static cpumask_t rapl_cpu_mask;
125static int rapl_cntr_mask; 133static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
127static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); 135static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
128static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); 136static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
129 137
138static struct x86_pmu_quirk *rapl_quirks;
130static inline u64 rapl_read_counter(struct perf_event *event) 139static inline u64 rapl_read_counter(struct perf_event *event)
131{ 140{
132 u64 raw; 141 u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
134 return raw; 143 return raw;
135} 144}
136 145
137static inline u64 rapl_scale(u64 v) 146#define rapl_add_quirk(func_) \
147do { \
148 static struct x86_pmu_quirk __quirk __initdata = { \
149 .func = func_, \
150 }; \
151 __quirk.next = rapl_quirks; \
152 rapl_quirks = &__quirk; \
153} while (0)
154
155static inline u64 rapl_scale(u64 v, int cfg)
138{ 156{
157 if (cfg > NR_RAPL_DOMAINS) {
158 pr_warn("invalid domain %d, failed to scale data\n", cfg);
159 return v;
160 }
139 /* 161 /*
140 * scale delta to smallest unit (1/2^32) 162 * scale delta to smallest unit (1/2^32)
141 * users must then scale back: count * 1/(1e9*2^32) to get Joules 163 * users must then scale back: count * 1/(1e9*2^32) to get Joules
142 * or use ldexp(count, -32). 164 * or use ldexp(count, -32).
143 * Watts = Joules/Time delta 165 * Watts = Joules/Time delta
144 */ 166 */
145 return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); 167 return v << (32 - rapl_hw_unit[cfg - 1]);
146} 168}
147 169
148static u64 rapl_event_update(struct perf_event *event) 170static u64 rapl_event_update(struct perf_event *event)
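As the comment in rapl_scale() notes, counters handed to user space are normalized to 1/2^32 Joules; a minimal user-space sketch of the ldexp(count, -32) conversion it suggests (link with -lm):

#include <math.h>
#include <stdio.h>

/* Convert a raw rapl counter value (in 1/2^32 J units) to Joules. */
static double rapl_count_to_joules(unsigned long long count)
{
	return ldexp((double)count, -32);	/* count * 2^-32 */
}

int main(void)
{
	printf("%.3f J\n", rapl_count_to_joules(1ULL << 32));	/* 1.000 J */
	return 0;
}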
@@ -173,7 +195,7 @@ again:
173 delta = (new_raw_count << shift) - (prev_raw_count << shift); 195 delta = (new_raw_count << shift) - (prev_raw_count << shift);
174 delta >>= shift; 196 delta >>= shift;
175 197
176 sdelta = rapl_scale(delta); 198 sdelta = rapl_scale(delta, event->hw.config);
177 199
178 local64_add(sdelta, &event->count); 200 local64_add(sdelta, &event->count);
179 201
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
546 cpumask_set_cpu(cpu, &rapl_cpu_mask); 568 cpumask_set_cpu(cpu, &rapl_cpu_mask);
547} 569}
548 570
571static __init void rapl_hsw_server_quirk(void)
572{
573 /*
574 * DRAM domain on HSW server has fixed energy unit which can be
575 * different than the unit from power unit MSR.
576 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
577 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
578 */
579 rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
580}
581
549static int rapl_cpu_prepare(int cpu) 582static int rapl_cpu_prepare(int cpu)
550{ 583{
551 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); 584 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
552 int phys_id = topology_physical_package_id(cpu); 585 int phys_id = topology_physical_package_id(cpu);
553 u64 ms; 586 u64 ms;
554 u64 msr_rapl_power_unit_bits;
555 587
556 if (pmu) 588 if (pmu)
557 return 0; 589 return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
559 if (phys_id < 0) 591 if (phys_id < 0)
560 return -1; 592 return -1;
561 593
562 /* protect rdmsrl() to handle virtualization */
563 if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
564 return -1;
565
566 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); 594 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
567 if (!pmu) 595 if (!pmu)
568 return -1; 596 return -1;
569
570 spin_lock_init(&pmu->lock); 597 spin_lock_init(&pmu->lock);
571 598
572 INIT_LIST_HEAD(&pmu->active_list); 599 INIT_LIST_HEAD(&pmu->active_list);
573 600
574 /*
575 * grab power unit as: 1/2^unit Joules
576 *
577 * we cache in local PMU instance
578 */
579 pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
580 pmu->pmu = &rapl_pmu_class; 601 pmu->pmu = &rapl_pmu_class;
581 602
582 /* 603 /*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
586 * divide interval by 2 to avoid lockstep (2 * 100) 607 * divide interval by 2 to avoid lockstep (2 * 100)
587 * if hw unit is 32, then we use 2 ms 1/200/2 608 * if hw unit is 32, then we use 2 ms 1/200/2
588 */ 609 */
589 if (pmu->hw_unit < 32) 610 if (rapl_hw_unit[0] < 32)
590 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); 611 ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
591 else 612 else
592 ms = 2; 613 ms = 2;
593 614
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
655 return NOTIFY_OK; 676 return NOTIFY_OK;
656} 677}
657 678
679static int rapl_check_hw_unit(void)
680{
681 u64 msr_rapl_power_unit_bits;
682 int i;
683
684 /* protect rdmsrl() to handle virtualization */
685 if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
686 return -1;
687 for (i = 0; i < NR_RAPL_DOMAINS; i++)
688 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
689
690 return 0;
691}
692
658static const struct x86_cpu_id rapl_cpu_match[] = { 693static const struct x86_cpu_id rapl_cpu_match[] = {
659 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, 694 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
660 [1] = {}, 695 [1] = {},
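rapl_check_hw_unit() above reads the energy-status-unit field, bits 12:8 of MSR_RAPL_POWER_UNIT, which encodes the counter granularity as 1/2^ESU Joules. The extraction on its own, as a small sketch:

#include <stdint.h>

/* Energy Status Unit: bits 12:8 of MSR_RAPL_POWER_UNIT. */
static inline unsigned int rapl_energy_unit(uint64_t power_unit_msr)
{
	return (unsigned int)((power_unit_msr >> 8) & 0x1f);
}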
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
664{ 699{
665 struct rapl_pmu *pmu; 700 struct rapl_pmu *pmu;
666 int cpu, ret; 701 int cpu, ret;
702 struct x86_pmu_quirk *quirk;
703 int i;
667 704
668 /* 705 /*
669 * check for Intel processor family 6 706 * check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
678 rapl_cntr_mask = RAPL_IDX_CLN; 715 rapl_cntr_mask = RAPL_IDX_CLN;
679 rapl_pmu_events_group.attrs = rapl_events_cln_attr; 716 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
680 break; 717 break;
718 case 63: /* Haswell-Server */
719 rapl_add_quirk(rapl_hsw_server_quirk);
720 rapl_cntr_mask = RAPL_IDX_SRV;
721 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
722 break;
681 case 60: /* Haswell */ 723 case 60: /* Haswell */
682 case 69: /* Haswell-Celeron */ 724 case 69: /* Haswell-Celeron */
683 rapl_cntr_mask = RAPL_IDX_HSW; 725 rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
693 /* unsupported */ 735 /* unsupported */
694 return 0; 736 return 0;
695 } 737 }
738 ret = rapl_check_hw_unit();
739 if (ret)
740 return ret;
696 741
742 /* run cpu model quirks */
743 for (quirk = rapl_quirks; quirk; quirk = quirk->next)
744 quirk->func();
697 cpu_notifier_register_begin(); 745 cpu_notifier_register_begin();
698 746
699 for_each_online_cpu(cpu) { 747 for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
714 762
715 pmu = __this_cpu_read(rapl_pmu); 763 pmu = __this_cpu_read(rapl_pmu);
716 764
717 pr_info("RAPL PMU detected, hw unit 2^-%d Joules," 765 pr_info("RAPL PMU detected,"
718 " API unit is 2^-32 Joules," 766 " API unit is 2^-32 Joules,"
719 " %d fixed counters" 767 " %d fixed counters"
720 " %llu ms ovfl timer\n", 768 " %llu ms ovfl timer\n",
721 pmu->hw_unit,
722 hweight32(rapl_cntr_mask), 769 hweight32(rapl_cntr_mask),
723 ktime_to_ms(pmu->timer_interval)); 770 ktime_to_ms(pmu->timer_interval));
724 771 for (i = 0; i < NR_RAPL_DOMAINS; i++) {
772 if (rapl_cntr_mask & (1 << i)) {
773 pr_info("hw unit of domain %s 2^-%d Joules\n",
774 rapl_domain_names[i], rapl_hw_unit[i]);
775 }
776 }
725out: 777out:
726 cpu_notifier_register_done(); 778 cpu_notifier_register_done();
727 779
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
1132 } 1132 }
1133 } 1133 }
1134 1134
1135 if (ubox_dev) 1135 pci_dev_put(ubox_dev);
1136 pci_dev_put(ubox_dev);
1137 1136
1138 return err ? pcibios_err_to_errno(err) : 0; 1137 return err ? pcibios_err_to_errno(err) : 0;
1139} 1138}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
41 { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, 41 { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
42 { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, 42 { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
43 { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, 43 { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
44 { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
44 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, 45 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
45 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, 46 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
46 { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, 47 { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
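The scattered-feature entry added above maps Intel PT to CPUID leaf 0x7 (subleaf 0), EBX bit 25. A user-space sketch of the same check, assuming a compiler that ships <cpuid.h> with the __cpuid_count() macro; a robust version would first verify that the maximum basic leaf is at least 7:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	if (ebx & (1u << 25))
		puts("CPU reports Intel PT");
	else
		puts("CPU does not report Intel PT");
	return 0;
}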
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
105#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
106 struct pt_regs fixed_regs; 106 struct pt_regs fixed_regs;
107 107
108 if (!user_mode_vm(regs)) { 108 if (!user_mode(regs)) {
109 crash_fixup_ss_esp(&fixed_regs, regs); 109 crash_fixup_ss_esp(&fixed_regs, regs);
110 regs = &fixed_regs; 110 regs = &fixed_regs;
111 } 111 }
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d3503351242..6367a780cc8c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
286 initial_boot_params = dt = early_memremap(initial_dtb, map_len); 286 initial_boot_params = dt = early_memremap(initial_dtb, map_len);
287 size = of_get_flat_dt_size(); 287 size = of_get_flat_dt_size();
288 if (map_len < size) { 288 if (map_len < size) {
289 early_iounmap(dt, map_len); 289 early_memunmap(dt, map_len);
290 initial_boot_params = dt = early_memremap(initial_dtb, size); 290 initial_boot_params = dt = early_memremap(initial_dtb, size);
291 map_len = size; 291 map_len = size;
292 } 292 }
293 293
294 unflatten_and_copy_device_tree(); 294 unflatten_and_copy_device_tree();
295 early_iounmap(dt, map_len); 295 early_memunmap(dt, map_len);
296} 296}
297#else 297#else
298static inline void x86_flattree_get_config(void) { } 298static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..9c30acfadae2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
25int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; 25int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
26static int die_counter; 26static int die_counter;
27 27
28static void printk_stack_address(unsigned long address, int reliable) 28static void printk_stack_address(unsigned long address, int reliable,
29 void *data)
29{ 30{
30 pr_cont(" [<%p>] %s%pB\n", 31 printk("%s [<%p>] %s%pB\n",
31 (void *)address, reliable ? "" : "? ", (void *)address); 32 (char *)data, (void *)address, reliable ? "" : "? ",
33 (void *)address);
32} 34}
33 35
34void printk_address(unsigned long address) 36void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
155static void print_trace_address(void *data, unsigned long addr, int reliable) 157static void print_trace_address(void *data, unsigned long addr, int reliable)
156{ 158{
157 touch_nmi_watchdog(); 159 touch_nmi_watchdog();
158 printk(data); 160 printk_stack_address(addr, reliable, data);
159 printk_stack_address(addr, reliable);
160} 161}
161 162
162static const struct stacktrace_ops print_trace_ops = { 163static const struct stacktrace_ops print_trace_ops = {
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
278 print_modules(); 279 print_modules();
279 show_regs(regs); 280 show_regs(regs);
280#ifdef CONFIG_X86_32 281#ifdef CONFIG_X86_32
281 if (user_mode_vm(regs)) { 282 if (user_mode(regs)) {
282 sp = regs->sp; 283 sp = regs->sp;
283 ss = regs->ss & 0xffff; 284 ss = regs->ss & 0xffff;
284 } else { 285 } else {
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err)
307 unsigned long flags = oops_begin(); 308 unsigned long flags = oops_begin();
308 int sig = SIGSEGV; 309 int sig = SIGSEGV;
309 310
310 if (!user_mode_vm(regs)) 311 if (!user_mode(regs))
311 report_bug(regs->ip, regs); 312 report_bug(regs->ip, regs);
312 313
313 if (__die(str, regs, err)) 314 if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..464ffd69b92e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
108 for (i = 0; i < kstack_depth_to_print; i++) { 108 for (i = 0; i < kstack_depth_to_print; i++) {
109 if (kstack_end(stack)) 109 if (kstack_end(stack))
110 break; 110 break;
111 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 111 if ((i % STACKSLOTS_PER_LINE) == 0) {
112 pr_cont("\n"); 112 if (i != 0)
113 pr_cont(" %08lx", *stack++); 113 pr_cont("\n");
114 printk("%s %08lx", log_lvl, *stack++);
115 } else
116 pr_cont(" %08lx", *stack++);
114 touch_nmi_watchdog(); 117 touch_nmi_watchdog();
115 } 118 }
116 pr_cont("\n"); 119 pr_cont("\n");
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs)
123 int i; 126 int i;
124 127
125 show_regs_print_info(KERN_EMERG); 128 show_regs_print_info(KERN_EMERG);
126 __show_regs(regs, !user_mode_vm(regs)); 129 __show_regs(regs, !user_mode(regs));
127 130
128 /* 131 /*
129 * When in-kernel, we also print out the stack and code at the 132 * When in-kernel, we also print out the stack and code at the
130 * time of the fault.. 133 * time of the fault..
131 */ 134 */
132 if (!user_mode_vm(regs)) { 135 if (!user_mode(regs)) {
133 unsigned int code_prologue = code_bytes * 43 / 64; 136 unsigned int code_prologue = code_bytes * 43 / 64;
134 unsigned int code_len = code_bytes; 137 unsigned int code_len = code_bytes;
135 unsigned char c; 138 unsigned char c;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index ff86f19b5758..5f1c6266eb30 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
280 pr_cont(" <EOI> "); 280 pr_cont(" <EOI> ");
281 } 281 }
282 } else { 282 } else {
283 if (((long) stack & (THREAD_SIZE-1)) == 0) 283 if (kstack_end(stack))
284 break; 284 break;
285 } 285 }
286 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 286 if ((i % STACKSLOTS_PER_LINE) == 0) {
287 pr_cont("\n"); 287 if (i != 0)
288 pr_cont(" %016lx", *stack++); 288 pr_cont("\n");
289 printk("%s %016lx", log_lvl, *stack++);
290 } else
291 pr_cont(" %016lx", *stack++);
289 touch_nmi_watchdog(); 292 touch_nmi_watchdog();
290 } 293 }
291 preempt_enable(); 294 preempt_enable();
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..e2ce85db2283 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
149 case E820_UNUSABLE: 149 case E820_UNUSABLE:
150 printk(KERN_CONT "unusable"); 150 printk(KERN_CONT "unusable");
151 break; 151 break;
152 case E820_PRAM:
153 printk(KERN_CONT "persistent (type %u)", type);
154 break;
152 default: 155 default:
153 printk(KERN_CONT "type %u", type); 156 printk(KERN_CONT "type %u", type);
154 break; 157 break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
343 * continue building up new bios map based on this 346 * continue building up new bios map based on this
344 * information 347 * information
345 */ 348 */
346 if (current_type != last_type) { 349 if (current_type != last_type || current_type == E820_PRAM) {
347 if (last_type != 0) { 350 if (last_type != 0) {
348 new_bios[new_bios_entry].size = 351 new_bios[new_bios_entry].size =
349 change_point[chgidx]->addr - last_addr; 352 change_point[chgidx]->addr - last_addr;
@@ -661,7 +664,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
661 extmap = (struct e820entry *)(sdata->data); 664 extmap = (struct e820entry *)(sdata->data);
662 __append_e820_map(extmap, entries); 665 __append_e820_map(extmap, entries);
663 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 666 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
664 early_iounmap(sdata, data_len); 667 early_memunmap(sdata, data_len);
665 printk(KERN_INFO "e820: extended physical RAM map:\n"); 668 printk(KERN_INFO "e820: extended physical RAM map:\n");
666 e820_print_map("extended"); 669 e820_print_map("extended");
667} 670}
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
688 register_nosave_region(pfn, PFN_UP(ei->addr)); 691 register_nosave_region(pfn, PFN_UP(ei->addr));
689 692
690 pfn = PFN_DOWN(ei->addr + ei->size); 693 pfn = PFN_DOWN(ei->addr + ei->size);
694
691 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) 695 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
692 register_nosave_region(PFN_UP(ei->addr), pfn); 696 register_nosave_region(PFN_UP(ei->addr), pfn);
693 697
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
748/* 752/*
749 * Find the highest page frame number we have available 753 * Find the highest page frame number we have available
750 */ 754 */
751static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) 755static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
752{ 756{
753 int i; 757 int i;
754 unsigned long last_pfn = 0; 758 unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
759 unsigned long start_pfn; 763 unsigned long start_pfn;
760 unsigned long end_pfn; 764 unsigned long end_pfn;
761 765
762 if (ei->type != type) 766 /*
767 * Persistent memory is accounted as ram for purposes of
768 * establishing max_pfn and mem_map.
769 */
770 if (ei->type != E820_RAM && ei->type != E820_PRAM)
763 continue; 771 continue;
764 772
765 start_pfn = ei->addr >> PAGE_SHIFT; 773 start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
784} 792}
785unsigned long __init e820_end_of_ram_pfn(void) 793unsigned long __init e820_end_of_ram_pfn(void)
786{ 794{
787 return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); 795 return e820_end_pfn(MAX_ARCH_PFN);
788} 796}
789 797
790unsigned long __init e820_end_of_low_ram_pfn(void) 798unsigned long __init e820_end_of_low_ram_pfn(void)
791{ 799{
792 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); 800 return e820_end_pfn(1UL << (32-PAGE_SHIFT));
793} 801}
794 802
795static void early_panic(char *msg) 803static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
866 } else if (*p == '$') { 874 } else if (*p == '$') {
867 start_at = memparse(p+1, &p); 875 start_at = memparse(p+1, &p);
868 e820_add_region(start_at, mem_size, E820_RESERVED); 876 e820_add_region(start_at, mem_size, E820_RESERVED);
877 } else if (*p == '!') {
878 start_at = memparse(p+1, &p);
879 e820_add_region(start_at, mem_size, E820_PRAM);
869 } else 880 } else
870 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 881 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
871 882
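With the '!' specifier wired up above, the memmap= boot parameter gains a way to mark a range as Persistent RAM by hand: for example, booting with memmap=4G!16G adds a 4 GiB E820_PRAM region starting at the 16 GiB physical address, via the memparse()/e820_add_region() path shown in this hunk.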
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
907 case E820_ACPI: return "ACPI Tables"; 918 case E820_ACPI: return "ACPI Tables";
908 case E820_NVS: return "ACPI Non-volatile Storage"; 919 case E820_NVS: return "ACPI Non-volatile Storage";
909 case E820_UNUSABLE: return "Unusable memory"; 920 case E820_UNUSABLE: return "Unusable memory";
921 case E820_PRAM: return "Persistent RAM";
910 default: return "reserved"; 922 default: return "reserved";
911 } 923 }
912} 924}
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
940 * pci device BAR resource and insert them later in 952 * pci device BAR resource and insert them later in
941 * pcibios_resource_survey() 953 * pcibios_resource_survey()
942 */ 954 */
943 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { 955 if (((e820.map[i].type != E820_RESERVED) &&
956 (e820.map[i].type != E820_PRAM)) ||
957 res->start < (1ULL<<20)) {
944 res->flags |= IORESOURCE_BUSY; 958 res->flags |= IORESOURCE_BUSY;
945 insert_resource(&iomem_resource, res); 959 insert_resource(&iomem_resource, res);
946 } 960 }
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a1be88..49ff55ef9b26 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
95#define DLL 0 /* Divisor Latch Low */ 95#define DLL 0 /* Divisor Latch Low */
96#define DLH 1 /* Divisor latch High */ 96#define DLH 1 /* Divisor latch High */
97 97
98static void mem32_serial_out(unsigned long addr, int offset, int value)
99{
100 uint32_t *vaddr = (uint32_t *)addr;
101 /* shift implied by pointer type */
102 writel(value, vaddr + offset);
103}
104
105static unsigned int mem32_serial_in(unsigned long addr, int offset)
106{
107 uint32_t *vaddr = (uint32_t *)addr;
108 /* shift implied by pointer type */
109 return readl(vaddr + offset);
110}
111
112static unsigned int io_serial_in(unsigned long addr, int offset) 98static unsigned int io_serial_in(unsigned long addr, int offset)
113{ 99{
114 return inb(addr + offset); 100 return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
205} 191}
206 192
207#ifdef CONFIG_PCI 193#ifdef CONFIG_PCI
194static void mem32_serial_out(unsigned long addr, int offset, int value)
195{
196 u32 *vaddr = (u32 *)addr;
197 /* shift implied by pointer type */
198 writel(value, vaddr + offset);
199}
200
201static unsigned int mem32_serial_in(unsigned long addr, int offset)
202{
203 u32 *vaddr = (u32 *)addr;
204 /* shift implied by pointer type */
205 return readl(vaddr + offset);
206}
207
208/* 208/*
209 * early_pci_serial_init() 209 * early_pci_serial_init()
210 * 210 *
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
217 unsigned divisor; 217 unsigned divisor;
218 unsigned long baud = DEFAULT_BAUD; 218 unsigned long baud = DEFAULT_BAUD;
219 u8 bus, slot, func; 219 u8 bus, slot, func;
220 uint32_t classcode, bar0; 220 u32 classcode, bar0;
221 uint16_t cmdreg; 221 u16 cmdreg;
222 char *e; 222 char *e;
223 223
224 224
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
395 /*CFI_REL_OFFSET cs, 0*/ 395 /*CFI_REL_OFFSET cs, 0*/
396 /* 396 /*
397 * Push current_thread_info()->sysenter_return to the stack. 397 * Push current_thread_info()->sysenter_return to the stack.
398 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 398 * A tiny bit of offset fixup is necessary: TI_sysenter_return
399 * pushed above; +8 corresponds to copy_thread's esp0 setting. 399 * is relative to thread_info, which is at the bottom of the
400 * kernel stack page. 4*4 means the 4 words pushed above;
401 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
402 * and THREAD_SIZE takes us to the bottom.
400 */ 403 */
401 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) 404 pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
402 CFI_REL_OFFSET eip, 0 405 CFI_REL_OFFSET eip, 0
403 406
404 pushl_cfi %eax 407 pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
432 TRACE_IRQS_OFF 435 TRACE_IRQS_OFF
433 movl TI_flags(%ebp), %ecx 436 movl TI_flags(%ebp), %ecx
434 testl $_TIF_ALLWORK_MASK, %ecx 437 testl $_TIF_ALLWORK_MASK, %ecx
435 jne sysexit_audit 438 jnz sysexit_audit
436sysenter_exit: 439sysenter_exit:
437/* if something modifies registers it must also disable sysexit */ 440/* if something modifies registers it must also disable sysexit */
438 movl PT_EIP(%esp), %edx 441 movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
460 463
461sysexit_audit: 464sysexit_audit:
462 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 465 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
463 jne syscall_exit_work 466 jnz syscall_exit_work
464 TRACE_IRQS_ON 467 TRACE_IRQS_ON
465 ENABLE_INTERRUPTS(CLBR_ANY) 468 ENABLE_INTERRUPTS(CLBR_ANY)
466 movl %eax,%edx /* second arg, syscall return value */ 469 movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
472 TRACE_IRQS_OFF 475 TRACE_IRQS_OFF
473 movl TI_flags(%ebp), %ecx 476 movl TI_flags(%ebp), %ecx
474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 477 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
475 jne syscall_exit_work 478 jnz syscall_exit_work
476 movl PT_EAX(%esp),%eax /* reload syscall return value */ 479 movl PT_EAX(%esp),%eax /* reload syscall return value */
477 jmp sysenter_exit 480 jmp sysenter_exit
478#endif 481#endif
@@ -510,7 +513,7 @@ syscall_exit:
510 TRACE_IRQS_OFF 513 TRACE_IRQS_OFF
511 movl TI_flags(%ebp), %ecx 514 movl TI_flags(%ebp), %ecx
512 testl $_TIF_ALLWORK_MASK, %ecx # current->work 515 testl $_TIF_ALLWORK_MASK, %ecx # current->work
513 jne syscall_exit_work 516 jnz syscall_exit_work
514 517
515restore_all: 518restore_all:
516 TRACE_IRQS_IRET 519 TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
612#ifdef CONFIG_VM86 615#ifdef CONFIG_VM86
613 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 616 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614 movl %esp, %eax 617 movl %esp, %eax
615 jne work_notifysig_v86 # returning to kernel-space or 618 jnz work_notifysig_v86 # returning to kernel-space or
616 # vm86-space 619 # vm86-space
6171: 6201:
618#else 621#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
720.endm 723.endm
721 724
722/* 725/*
723 * Build the entry stubs and pointer table with some assembler magic. 726 * Build the entry stubs with some assembler magic.
724 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 727 * We pack 1 stub into every 8-byte block.
725 * single cache line on all modern x86 implementations.
726 */ 728 */
727.section .init.rodata,"a" 729 .align 8
728ENTRY(interrupt)
729.section .entry.text, "ax"
730 .p2align 5
731 .p2align CONFIG_X86_L1_CACHE_SHIFT
732ENTRY(irq_entries_start) 730ENTRY(irq_entries_start)
733 RING0_INT_FRAME 731 RING0_INT_FRAME
734vector=FIRST_EXTERNAL_VECTOR 732 vector=FIRST_EXTERNAL_VECTOR
735.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 733 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
736 .balign 32 734 pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
737 .rept 7 735 vector=vector+1
738 .if vector < FIRST_SYSTEM_VECTOR 736 jmp common_interrupt
739 .if vector <> FIRST_EXTERNAL_VECTOR
740 CFI_ADJUST_CFA_OFFSET -4 737 CFI_ADJUST_CFA_OFFSET -4
741 .endif 738 .align 8
7421: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ 739 .endr
743 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
744 jmp 2f
745 .endif
746 .previous
747 .long 1b
748 .section .entry.text, "ax"
749vector=vector+1
750 .endif
751 .endr
7522: jmp common_interrupt
753.endr
754END(irq_entries_start) 740END(irq_entries_start)
755 741
756.previous
757END(interrupt)
758.previous
759
760/* 742/*
761 * the CPU automatically disables interrupts when executing an IRQ vector, 743 * the CPU automatically disables interrupts when executing an IRQ vector,
762 * so IRQ-flags tracing has to follow that: 744 * so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
816 pushl_cfi $0 798 pushl_cfi $0
817#ifdef CONFIG_X86_INVD_BUG 799#ifdef CONFIG_X86_INVD_BUG
818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 800 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
819661: pushl_cfi $do_general_protection 801 ALTERNATIVE "pushl_cfi $do_general_protection", \
820662: 802 "pushl $do_simd_coprocessor_error", \
821.section .altinstructions,"a" 803 X86_FEATURE_XMM
822 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
823.previous
824.section .altinstr_replacement,"ax"
825663: pushl $do_simd_coprocessor_error
826664:
827.previous
828#else 804#else
829 pushl_cfi $do_simd_coprocessor_error 805 pushl_cfi $do_simd_coprocessor_error
830#endif 806#endif
@@ -1240,20 +1216,13 @@ error_code:
1240 /*CFI_REL_OFFSET es, 0*/ 1216 /*CFI_REL_OFFSET es, 0*/
1241 pushl_cfi %ds 1217 pushl_cfi %ds
1242 /*CFI_REL_OFFSET ds, 0*/ 1218 /*CFI_REL_OFFSET ds, 0*/
1243 pushl_cfi %eax 1219 pushl_cfi_reg eax
1244 CFI_REL_OFFSET eax, 0 1220 pushl_cfi_reg ebp
1245 pushl_cfi %ebp 1221 pushl_cfi_reg edi
1246 CFI_REL_OFFSET ebp, 0 1222 pushl_cfi_reg esi
1247 pushl_cfi %edi 1223 pushl_cfi_reg edx
1248 CFI_REL_OFFSET edi, 0 1224 pushl_cfi_reg ecx
1249 pushl_cfi %esi 1225 pushl_cfi_reg ebx
1250 CFI_REL_OFFSET esi, 0
1251 pushl_cfi %edx
1252 CFI_REL_OFFSET edx, 0
1253 pushl_cfi %ecx
1254 CFI_REL_OFFSET ecx, 0
1255 pushl_cfi %ebx
1256 CFI_REL_OFFSET ebx, 0
1257 cld 1226 cld
1258 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1259 movl %ecx, %fs 1228 movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f0095a76c182..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
14 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
16 * 16 *
17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et.al.
19 *
20 * A note on terminology: 17 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP 18 * - iret frame: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack. 19 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all register saved.
25 * 20 *
26 * Some macro usage: 21 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better 22 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code. 23 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers
32 * not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table. 24 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. 25 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - idtentry - Define exception entry points. 26 * - idtentry - Define exception entry points.
40 */ 27 */
@@ -70,10 +57,6 @@
70 .section .entry.text, "ax" 57 .section .entry.text, "ax"
71 58
72 59
73#ifndef CONFIG_PREEMPT
74#define retint_kernel retint_restore_args
75#endif
76
77#ifdef CONFIG_PARAVIRT 60#ifdef CONFIG_PARAVIRT
78ENTRY(native_usergs_sysret64) 61ENTRY(native_usergs_sysret64)
79 swapgs 62 swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
82#endif /* CONFIG_PARAVIRT */ 65#endif /* CONFIG_PARAVIRT */
83 66
84 67
85.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 68.macro TRACE_IRQS_IRETQ
86#ifdef CONFIG_TRACE_IRQFLAGS 69#ifdef CONFIG_TRACE_IRQFLAGS
87 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 70 bt $9,EFLAGS(%rsp) /* interrupts off? */
88 jnc 1f 71 jnc 1f
89 TRACE_IRQS_ON 72 TRACE_IRQS_ON
901: 731:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
116 call debug_stack_reset 99 call debug_stack_reset
117.endm 100.endm
118 101
119.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET 102.macro TRACE_IRQS_IRETQ_DEBUG
120 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 103 bt $9,EFLAGS(%rsp) /* interrupts off? */
121 jnc 1f 104 jnc 1f
122 TRACE_IRQS_ON_DEBUG 105 TRACE_IRQS_ON_DEBUG
1231: 1061:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
130#endif 113#endif
131 114
132/* 115/*
133 * C code is not supposed to know about undefined top of stack. Every time 116 * empty frame
134 * a C function with an pt_regs argument is called from the SYSCALL based
135 * fast path FIXUP_TOP_OF_STACK is needed.
136 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
137 * manipulation.
138 */
139
140 /* %rsp:at FRAMEEND */
141 .macro FIXUP_TOP_OF_STACK tmp offset=0
142 movq PER_CPU_VAR(old_rsp),\tmp
143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp)
146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
148 movq R11+\offset(%rsp),\tmp /* get eflags */
149 movq \tmp,EFLAGS+\offset(%rsp)
150 .endm
151
152 .macro RESTORE_TOP_OF_STACK tmp offset=0
153 movq RSP+\offset(%rsp),\tmp
154 movq \tmp,PER_CPU_VAR(old_rsp)
155 movq EFLAGS+\offset(%rsp),\tmp
156 movq \tmp,R11+\offset(%rsp)
157 .endm
158
159/*
160 * initial frame state for interrupts (and exceptions without error code)
161 */ 117 */
162 .macro EMPTY_FRAME start=1 offset=0 118 .macro EMPTY_FRAME start=1 offset=0
163 .if \start 119 .if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
173 * initial frame state for interrupts (and exceptions without error code) 129 * initial frame state for interrupts (and exceptions without error code)
174 */ 130 */
175 .macro INTR_FRAME start=1 offset=0 131 .macro INTR_FRAME start=1 offset=0
176 EMPTY_FRAME \start, SS+8+\offset-RIP 132 EMPTY_FRAME \start, 5*8+\offset
177 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ 133 /*CFI_REL_OFFSET ss, 4*8+\offset*/
178 CFI_REL_OFFSET rsp, RSP+\offset-RIP 134 CFI_REL_OFFSET rsp, 3*8+\offset
179 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ 135 /*CFI_REL_OFFSET rflags, 2*8+\offset*/
180 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ 136 /*CFI_REL_OFFSET cs, 1*8+\offset*/
181 CFI_REL_OFFSET rip, RIP+\offset-RIP 137 CFI_REL_OFFSET rip, 0*8+\offset
182 .endm 138 .endm
183 139
184/* 140/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
186 * with vector already pushed) 142 * with vector already pushed)
187 */ 143 */
188 .macro XCPT_FRAME start=1 offset=0 144 .macro XCPT_FRAME start=1 offset=0
189 INTR_FRAME \start, RIP+\offset-ORIG_RAX 145 INTR_FRAME \start, 1*8+\offset
190 .endm
191
192/*
193 * frame that enables calling into C.
194 */
195 .macro PARTIAL_FRAME start=1 offset=0
196 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
197 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
198 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
199 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
200 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
201 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
202 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
203 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
204 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
205 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
206 .endm 146 .endm
207 147
208/* 148/*
209 * frame that enables passing a complete pt_regs to a C function. 149 * frame that enables passing a complete pt_regs to a C function.
210 */ 150 */
211 .macro DEFAULT_FRAME start=1 offset=0 151 .macro DEFAULT_FRAME start=1 offset=0
212 PARTIAL_FRAME \start, R11+\offset-R15 152 XCPT_FRAME \start, ORIG_RAX+\offset
153 CFI_REL_OFFSET rdi, RDI+\offset
154 CFI_REL_OFFSET rsi, RSI+\offset
155 CFI_REL_OFFSET rdx, RDX+\offset
156 CFI_REL_OFFSET rcx, RCX+\offset
157 CFI_REL_OFFSET rax, RAX+\offset
158 CFI_REL_OFFSET r8, R8+\offset
159 CFI_REL_OFFSET r9, R9+\offset
160 CFI_REL_OFFSET r10, R10+\offset
161 CFI_REL_OFFSET r11, R11+\offset
213 CFI_REL_OFFSET rbx, RBX+\offset 162 CFI_REL_OFFSET rbx, RBX+\offset
214 CFI_REL_OFFSET rbp, RBP+\offset 163 CFI_REL_OFFSET rbp, RBP+\offset
215 CFI_REL_OFFSET r12, R12+\offset 164 CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
218 CFI_REL_OFFSET r15, R15+\offset 167 CFI_REL_OFFSET r15, R15+\offset
219 .endm 168 .endm
220 169
221ENTRY(save_paranoid)
222 XCPT_FRAME 1 RDI+8
223 cld
224 movq %rdi, RDI+8(%rsp)
225 movq %rsi, RSI+8(%rsp)
226 movq_cfi rdx, RDX+8
227 movq_cfi rcx, RCX+8
228 movq_cfi rax, RAX+8
229 movq %r8, R8+8(%rsp)
230 movq %r9, R9+8(%rsp)
231 movq %r10, R10+8(%rsp)
232 movq %r11, R11+8(%rsp)
233 movq_cfi rbx, RBX+8
234 movq %rbp, RBP+8(%rsp)
235 movq %r12, R12+8(%rsp)
236 movq %r13, R13+8(%rsp)
237 movq %r14, R14+8(%rsp)
238 movq %r15, R15+8(%rsp)
239 movl $1,%ebx
240 movl $MSR_GS_BASE,%ecx
241 rdmsr
242 testl %edx,%edx
243 js 1f /* negative -> in kernel */
244 SWAPGS
245 xorl %ebx,%ebx
2461: ret
247 CFI_ENDPROC
248END(save_paranoid)
249
250/* 170/*
251 * A newly forked process directly context switches into this address. 171 * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
252 * 172 *
253 * rdi: prev task we switched from 173 * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
254 */ 174 * then loads new ss, cs, and rip from previously programmed MSRs.
255ENTRY(ret_from_fork) 175 * rflags gets masked by a value from another MSR (so CLD and CLAC
256 DEFAULT_FRAME 176 * are not needed). SYSCALL does not save anything on the stack
257 177 * and does not change rsp.
258 LOCK ; btr $TIF_FORK,TI_flags(%r8)
259
260 pushq_cfi $0x0002
261 popfq_cfi # reset kernel eflags
262
263 call schedule_tail # rdi: 'prev' task parameter
264
265 GET_THREAD_INFO(%rcx)
266
267 RESTORE_REST
268
269 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
270 jz 1f
271
272 /*
273 * By the time we get here, we have no idea whether our pt_regs,
274 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
275 * the slow path, or one of the ia32entry paths.
276 * Use int_ret_from_sys_call to return, since it can safely handle
277 * all of the above.
278 */
279 jmp int_ret_from_sys_call
280
2811:
282 subq $REST_SKIP, %rsp # leave space for volatiles
283 CFI_ADJUST_CFA_OFFSET REST_SKIP
284 movq %rbp, %rdi
285 call *%rbx
286 movl $0, RAX(%rsp)
287 RESTORE_REST
288 jmp int_ret_from_sys_call
289 CFI_ENDPROC
290END(ret_from_fork)
291
292/*
293 * System call entry. Up to 6 arguments in registers are supported.
294 * 178 *
295 * SYSCALL does not save anything on the stack and does not change the 179 * Registers on entry:
296 * stack pointer. However, it does mask the flags register for us, so
297 * CLD and CLAC are not needed.
298 */
299
300/*
301 * Register setup:
302 * rax system call number 180 * rax system call number
181 * rcx return address
182 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
303 * rdi arg0 183 * rdi arg0
304 * rcx return address for syscall/sysret, C arg3
305 * rsi arg1 184 * rsi arg1
306 * rdx arg2 185 * rdx arg2
307 * r10 arg3 (--> moved to rcx for C) 186 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
308 * r8 arg4 187 * r8 arg4
309 * r9 arg5 188 * r9 arg5
310 * r11 eflags for syscall/sysret, temporary for C 189 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
311 * r12-r15,rbp,rbx saved by C code, not touched.
312 * 190 *
313 * Interrupts are off on entry.
314 * Only called from user space. 191 * Only called from user space.
315 * 192 *
316 * XXX if we had a free scratch register we could save the RSP into the stack frame 193 * When user can change pt_regs->foo always force IRET. That is because
317 * and report it properly in ps. Unfortunately we haven't.
318 *
319 * When user can change the frames always force IRET. That is because
320 * it deals with uncanonical addresses better. SYSRET has trouble 194 * it deals with uncanonical addresses better. SYSRET has trouble
321 * with them due to bugs in both AMD and Intel CPUs. 195 * with them due to bugs in both AMD and Intel CPUs.
322 */ 196 */
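For reference, the register convention documented above is exactly what a user-space caller sets up before executing SYSCALL; a minimal illustration (not kernel code) using GCC/Clang inline assembly for write(2):

/* Raw 64-bit syscall: rax = number, rdi/rsi/rdx = args, rcx/r11 clobbered. */
static long raw_write(int fd, const void *buf, unsigned long len)
{
	long ret;

	asm volatile ("syscall"
		      : "=a" (ret)
		      : "0" (1L),		/* __NR_write on x86_64 */
			"D" ((long)fd), "S" (buf), "d" (len)
		      : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	return raw_write(1, "hello\n", 6) == 6 ? 0 : 1;
}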
@@ -324,9 +198,15 @@ END(ret_from_fork)
324ENTRY(system_call) 198ENTRY(system_call)
325 CFI_STARTPROC simple 199 CFI_STARTPROC simple
326 CFI_SIGNAL_FRAME 200 CFI_SIGNAL_FRAME
327 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 201 CFI_DEF_CFA rsp,0
328 CFI_REGISTER rip,rcx 202 CFI_REGISTER rip,rcx
329 /*CFI_REGISTER rflags,r11*/ 203 /*CFI_REGISTER rflags,r11*/
204
205 /*
206 * Interrupts are off on entry.
207 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
208 * it is too small to ever cause noticeable irq latency.
209 */
330 SWAPGS_UNSAFE_STACK 210 SWAPGS_UNSAFE_STACK
331 /* 211 /*
332 * A hypervisor implementation might want to use a label 212 * A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
335 */ 215 */
336GLOBAL(system_call_after_swapgs) 216GLOBAL(system_call_after_swapgs)
337 217
338 movq %rsp,PER_CPU_VAR(old_rsp) 218 movq %rsp,PER_CPU_VAR(rsp_scratch)
339 movq PER_CPU_VAR(kernel_stack),%rsp 219 movq PER_CPU_VAR(kernel_stack),%rsp
220
221 /* Construct struct pt_regs on stack */
222 pushq_cfi $__USER_DS /* pt_regs->ss */
223 pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
340 /* 224 /*
341 * No need to follow this irqs off/on section - it's straight 225 * Re-enable interrupts.
342 * and short: 226 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
227 * must execute atomically in the face of possible interrupt-driven
228 * task preemption. We must enable interrupts only after we're done
229 * with using rsp_scratch:
343 */ 230 */
344 ENABLE_INTERRUPTS(CLBR_NONE) 231 ENABLE_INTERRUPTS(CLBR_NONE)
345 SAVE_ARGS 8, 0, rax_enosys=1 232 pushq_cfi %r11 /* pt_regs->flags */
346 movq_cfi rax,(ORIG_RAX-ARGOFFSET) 233 pushq_cfi $__USER_CS /* pt_regs->cs */
347 movq %rcx,RIP-ARGOFFSET(%rsp) 234 pushq_cfi %rcx /* pt_regs->ip */
348 CFI_REL_OFFSET rip,RIP-ARGOFFSET 235 CFI_REL_OFFSET rip,0
349 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 236 pushq_cfi_reg rax /* pt_regs->orig_ax */
237 pushq_cfi_reg rdi /* pt_regs->di */
238 pushq_cfi_reg rsi /* pt_regs->si */
239 pushq_cfi_reg rdx /* pt_regs->dx */
240 pushq_cfi_reg rcx /* pt_regs->cx */
241 pushq_cfi $-ENOSYS /* pt_regs->ax */
242 pushq_cfi_reg r8 /* pt_regs->r8 */
243 pushq_cfi_reg r9 /* pt_regs->r9 */
244 pushq_cfi_reg r10 /* pt_regs->r10 */
245 pushq_cfi_reg r11 /* pt_regs->r11 */
246 sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
247 CFI_ADJUST_CFA_OFFSET 6*8
248
249 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
350 jnz tracesys 250 jnz tracesys
351system_call_fastpath: 251system_call_fastpath:
352#if __SYSCALL_MASK == ~0 252#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
355 andl $__SYSCALL_MASK,%eax 255 andl $__SYSCALL_MASK,%eax
356 cmpl $__NR_syscall_max,%eax 256 cmpl $__NR_syscall_max,%eax
357#endif 257#endif
358 ja ret_from_sys_call /* and return regs->ax */ 258 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
359 movq %r10,%rcx 259 movq %r10,%rcx
360 call *sys_call_table(,%rax,8) # XXX: rip relative 260 call *sys_call_table(,%rax,8)
361 movq %rax,RAX-ARGOFFSET(%rsp) 261 movq %rax,RAX(%rsp)
2621:
362/* 263/*
363 * Syscall return path ending with SYSRET (fast path) 264 * Syscall return path ending with SYSRET (fast path).
364 * Has incomplete stack frame and undefined top of stack. 265 * Has incompletely filled pt_regs.
365 */ 266 */
366ret_from_sys_call:
367 LOCKDEP_SYS_EXIT 267 LOCKDEP_SYS_EXIT
268 /*
269 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
270 * it is too small to ever cause noticeable irq latency.
271 */
368 DISABLE_INTERRUPTS(CLBR_NONE) 272 DISABLE_INTERRUPTS(CLBR_NONE)
369 TRACE_IRQS_OFF
370 273
371 /* 274 /*
372 * We must check ti flags with interrupts (or at least preemption) 275 * We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
376 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is 279 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
377 * very bad. 280 * very bad.
378 */ 281 */
379 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 282 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
380 jnz int_ret_from_sys_call_fixup /* Go the the slow path */ 283 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
381 284
382 CFI_REMEMBER_STATE 285 CFI_REMEMBER_STATE
383 /* 286
384 * sysretq will re-enable interrupts: 287 RESTORE_C_REGS_EXCEPT_RCX_R11
385 */ 288 movq RIP(%rsp),%rcx
386 TRACE_IRQS_ON
387 movq RIP-ARGOFFSET(%rsp),%rcx
388 CFI_REGISTER rip,rcx 289 CFI_REGISTER rip,rcx
389 RESTORE_ARGS 1,-ARG_SKIP,0 290 movq EFLAGS(%rsp),%r11
390 /*CFI_REGISTER rflags,r11*/ 291 /*CFI_REGISTER rflags,r11*/
391 movq PER_CPU_VAR(old_rsp), %rsp 292 movq RSP(%rsp),%rsp
293 /*
294 * 64bit SYSRET restores rip from rcx,
295 * rflags from r11 (but RF and VM bits are forced to 0),
296 * cs and ss are loaded from MSRs.
297 * Restoration of rflags re-enables interrupts.
298 */
392 USERGS_SYSRET64 299 USERGS_SYSRET64
393 300
394 CFI_RESTORE_STATE 301 CFI_RESTORE_STATE
395 302
396int_ret_from_sys_call_fixup: 303 /* Do syscall entry tracing */
397 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
398 jmp int_ret_from_sys_call_irqs_off
399
400 /* Do syscall tracing */
401tracesys: 304tracesys:
402 leaq -REST_SKIP(%rsp), %rdi 305 movq %rsp, %rdi
403 movq $AUDIT_ARCH_X86_64, %rsi 306 movl $AUDIT_ARCH_X86_64, %esi
404 call syscall_trace_enter_phase1 307 call syscall_trace_enter_phase1
405 test %rax, %rax 308 test %rax, %rax
406 jnz tracesys_phase2 /* if needed, run the slow path */ 309 jnz tracesys_phase2 /* if needed, run the slow path */
407 LOAD_ARGS 0 /* else restore clobbered regs */ 310 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
311 movq ORIG_RAX(%rsp), %rax
408 jmp system_call_fastpath /* and return to the fast path */ 312 jmp system_call_fastpath /* and return to the fast path */
409 313
410tracesys_phase2: 314tracesys_phase2:
411 SAVE_REST 315 SAVE_EXTRA_REGS
412 FIXUP_TOP_OF_STACK %rdi
413 movq %rsp, %rdi 316 movq %rsp, %rdi
414 movq $AUDIT_ARCH_X86_64, %rsi 317 movl $AUDIT_ARCH_X86_64, %esi
415 movq %rax,%rdx 318 movq %rax,%rdx
416 call syscall_trace_enter_phase2 319 call syscall_trace_enter_phase2
417 320
418 /* 321 /*
419 * Reload arg registers from stack in case ptrace changed them. 322 * Reload registers from stack in case ptrace changed them.
420 * We don't reload %rax because syscall_trace_entry_phase2() returned 323 * We don't reload %rax because syscall_trace_entry_phase2() returned
421 * the value it wants us to use in the table lookup. 324 * the value it wants us to use in the table lookup.
422 */ 325 */
423 LOAD_ARGS ARGOFFSET, 1 326 RESTORE_C_REGS_EXCEPT_RAX
424 RESTORE_REST 327 RESTORE_EXTRA_REGS
425#if __SYSCALL_MASK == ~0 328#if __SYSCALL_MASK == ~0
426 cmpq $__NR_syscall_max,%rax 329 cmpq $__NR_syscall_max,%rax
427#else 330#else
428 andl $__SYSCALL_MASK,%eax 331 andl $__SYSCALL_MASK,%eax
429 cmpl $__NR_syscall_max,%eax 332 cmpl $__NR_syscall_max,%eax
430#endif 333#endif
431 ja int_ret_from_sys_call /* RAX(%rsp) is already set */ 334 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
432 movq %r10,%rcx /* fixup for C */ 335 movq %r10,%rcx /* fixup for C */
433 call *sys_call_table(,%rax,8) 336 call *sys_call_table(,%rax,8)
434 movq %rax,RAX-ARGOFFSET(%rsp) 337 movq %rax,RAX(%rsp)
435 /* Use IRET because user could have changed frame */ 3381:
339 /* Use IRET because user could have changed pt_regs->foo */
436 340
437/* 341/*
438 * Syscall return path ending with IRET. 342 * Syscall return path ending with IRET.
439 * Has correct top of stack, but partial stack frame. 343 * Has correct iret frame.
440 */ 344 */
441GLOBAL(int_ret_from_sys_call) 345GLOBAL(int_ret_from_sys_call)
442 DISABLE_INTERRUPTS(CLBR_NONE) 346 DISABLE_INTERRUPTS(CLBR_NONE)
347int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
443 TRACE_IRQS_OFF 348 TRACE_IRQS_OFF
444int_ret_from_sys_call_irqs_off:
445 movl $_TIF_ALLWORK_MASK,%edi 349 movl $_TIF_ALLWORK_MASK,%edi
446 /* edi: mask to check */ 350 /* edi: mask to check */
447GLOBAL(int_with_check) 351GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
450 movl TI_flags(%rcx),%edx 354 movl TI_flags(%rcx),%edx
451 andl %edi,%edx 355 andl %edi,%edx
452 jnz int_careful 356 jnz int_careful
453 andl $~TS_COMPAT,TI_status(%rcx) 357 andl $~TS_COMPAT,TI_status(%rcx)
454 jmp retint_swapgs 358 jmp syscall_return
455 359
456 /* Either reschedule or signal or syscall exit tracking needed. */ 360 /* Either reschedule or signal or syscall exit tracking needed. */
457 /* First do a reschedule test. */ 361 /* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
468 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
469 jmp int_with_check 373 jmp int_with_check
470 374
471 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full pt_regs */
472int_very_careful: 376int_very_careful:
473 TRACE_IRQS_ON 377 TRACE_IRQS_ON
474 ENABLE_INTERRUPTS(CLBR_NONE) 378 ENABLE_INTERRUPTS(CLBR_NONE)
475int_check_syscall_exit_work: 379 SAVE_EXTRA_REGS
476 SAVE_REST
477 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
478 testl $_TIF_WORK_SYSCALL_EXIT,%edx 381 testl $_TIF_WORK_SYSCALL_EXIT,%edx
479 jz int_signal 382 jz int_signal
@@ -492,86 +395,192 @@ int_signal:
492 call do_notify_resume 395 call do_notify_resume
4931: movl $_TIF_WORK_MASK,%edi 3961: movl $_TIF_WORK_MASK,%edi
494int_restore_rest: 397int_restore_rest:
495 RESTORE_REST 398 RESTORE_EXTRA_REGS
496 DISABLE_INTERRUPTS(CLBR_NONE) 399 DISABLE_INTERRUPTS(CLBR_NONE)
497 TRACE_IRQS_OFF 400 TRACE_IRQS_OFF
498 jmp int_with_check 401 jmp int_with_check
402
403syscall_return:
404 /* The IRETQ could re-enable interrupts: */
405 DISABLE_INTERRUPTS(CLBR_ANY)
406 TRACE_IRQS_IRETQ
407
408 /*
409 * Try to use SYSRET instead of IRET if we're returning to
410 * a completely clean 64-bit userspace context.
411 */
412 movq RCX(%rsp),%rcx
413 cmpq %rcx,RIP(%rsp) /* RCX == RIP */
414 jne opportunistic_sysret_failed
415
416 /*
417 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
418 * in kernel space. This essentially lets the user take over
419 * the kernel, since userspace controls RSP. It's not worth
420 * testing for canonicalness exactly -- this check detects any
421 * of the 17 high bits set, which is true for non-canonical
422 * or kernel addresses. (This will pessimize vsyscall=native.
423 * Big deal.)
424 *
425 * If virtual addresses ever become wider, this will need
426 * to be updated to remain correct on both old and new CPUs.
427 */
428 .ifne __VIRTUAL_MASK_SHIFT - 47
429 .error "virtual address width changed -- SYSRET checks need update"
430 .endif
431 shr $__VIRTUAL_MASK_SHIFT, %rcx
432 jnz opportunistic_sysret_failed
433
434 cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
435 jne opportunistic_sysret_failed
436
437 movq R11(%rsp),%r11
438 cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
439 jne opportunistic_sysret_failed
440
441 /*
442 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
443 * restoring TF results in a trap from userspace immediately after
444 * SYSRET. This would cause an infinite loop whenever #DB happens
445 * with register state that satisfies the opportunistic SYSRET
446 * conditions. For example, single-stepping this user code:
447 *
448 * movq $stuck_here,%rcx
449 * pushfq
450 * popq %r11
451 * stuck_here:
452 *
453 * would never get past 'stuck_here'.
454 */
455 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
456 jnz opportunistic_sysret_failed
457
458 /* nothing to check for RSP */
459
460 cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
461 jne opportunistic_sysret_failed
462
463 /*
464 * We win! This label is here just for ease of understanding
465 * perf profiles. Nothing jumps here.
466 */
467syscall_return_via_sysret:
468 CFI_REMEMBER_STATE
469 /* r11 is already restored (see code above) */
470 RESTORE_C_REGS_EXCEPT_R11
471 movq RSP(%rsp),%rsp
472 USERGS_SYSRET64
473 CFI_RESTORE_STATE
474
475opportunistic_sysret_failed:
476 SWAPGS
477 jmp restore_c_regs_and_iret
499 CFI_ENDPROC 478 CFI_ENDPROC
500END(system_call) 479END(system_call)
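
For reference, a minimal C rendering (not part of the patch) of the non-canonical RCX test used by the opportunistic SYSRET path above, assuming __VIRTUAL_MASK_SHIFT == 47 (48-bit virtual addresses) as the .ifne guard asserts; any of the 17 high bits set means a kernel or non-canonical address, so the code falls back to IRET:

	#include <stdbool.h>
	#include <stdint.h>

	static bool rcx_ok_for_sysret(uint64_t rcx)
	{
		/* shr $__VIRTUAL_MASK_SHIFT, %rcx ; jnz opportunistic_sysret_failed */
		return (rcx >> 47) == 0;
	}
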
501 480
481
502 .macro FORK_LIKE func 482 .macro FORK_LIKE func
503ENTRY(stub_\func) 483ENTRY(stub_\func)
504 CFI_STARTPROC 484 CFI_STARTPROC
505 popq %r11 /* save return address */ 485 DEFAULT_FRAME 0, 8 /* offset 8: return address */
506 PARTIAL_FRAME 0 486 SAVE_EXTRA_REGS 8
507 SAVE_REST 487 jmp sys_\func
508 pushq %r11 /* put it back on stack */
509 FIXUP_TOP_OF_STACK %r11, 8
510 DEFAULT_FRAME 0 8 /* offset 8: return address */
511 call sys_\func
512 RESTORE_TOP_OF_STACK %r11, 8
513 ret $REST_SKIP /* pop extended registers */
514 CFI_ENDPROC 488 CFI_ENDPROC
515END(stub_\func) 489END(stub_\func)
516 .endm 490 .endm
517 491
518 .macro FIXED_FRAME label,func
519ENTRY(\label)
520 CFI_STARTPROC
521 PARTIAL_FRAME 0 8 /* offset 8: return address */
522 FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
523 call \func
524 RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
525 ret
526 CFI_ENDPROC
527END(\label)
528 .endm
529
530 FORK_LIKE clone 492 FORK_LIKE clone
531 FORK_LIKE fork 493 FORK_LIKE fork
532 FORK_LIKE vfork 494 FORK_LIKE vfork
533 FIXED_FRAME stub_iopl, sys_iopl
534 495
535ENTRY(stub_execve) 496ENTRY(stub_execve)
536 CFI_STARTPROC 497 CFI_STARTPROC
537 addq $8, %rsp 498 DEFAULT_FRAME 0, 8
538 PARTIAL_FRAME 0 499 call sys_execve
539 SAVE_REST 500return_from_execve:
540 FIXUP_TOP_OF_STACK %r11 501 testl %eax, %eax
541 call sys_execve 502 jz 1f
542 movq %rax,RAX(%rsp) 503 /* exec failed, can use fast SYSRET code path in this case */
543 RESTORE_REST 504 ret
544 jmp int_ret_from_sys_call 5051:
506 /* must use IRET code path (pt_regs->cs may have changed) */
507 addq $8, %rsp
508 CFI_ADJUST_CFA_OFFSET -8
509 ZERO_EXTRA_REGS
510 movq %rax,RAX(%rsp)
511 jmp int_ret_from_sys_call
545 CFI_ENDPROC 512 CFI_ENDPROC
546END(stub_execve) 513END(stub_execve)
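
A small sketch (illustrative name, not from the patch) of the return_from_execve decision above: only a successful execve rewrites pt_regs (including cs), so only that case is forced onto the IRET path:

	#include <stdbool.h>

	static bool execve_return_needs_iret(long retval)
	{
		/* testl %eax,%eax ; jz 1f -- zero (success) takes the IRET path */
		return retval == 0;
	}
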
547 514/*
548ENTRY(stub_execveat) 515 * Remaining execve stubs are only 7 bytes long.
516 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
517 */
518 .align 8
519GLOBAL(stub_execveat)
549 CFI_STARTPROC 520 CFI_STARTPROC
550 addq $8, %rsp 521 DEFAULT_FRAME 0, 8
551 PARTIAL_FRAME 0 522 call sys_execveat
552 SAVE_REST 523 jmp return_from_execve
553 FIXUP_TOP_OF_STACK %r11
554 call sys_execveat
555 RESTORE_TOP_OF_STACK %r11
556 movq %rax,RAX(%rsp)
557 RESTORE_REST
558 jmp int_ret_from_sys_call
559 CFI_ENDPROC 524 CFI_ENDPROC
560END(stub_execveat) 525END(stub_execveat)
561 526
527#ifdef CONFIG_X86_X32_ABI
528 .align 8
529GLOBAL(stub_x32_execve)
530 CFI_STARTPROC
531 DEFAULT_FRAME 0, 8
532 call compat_sys_execve
533 jmp return_from_execve
534 CFI_ENDPROC
535END(stub_x32_execve)
536 .align 8
537GLOBAL(stub_x32_execveat)
538 CFI_STARTPROC
539 DEFAULT_FRAME 0, 8
540 call compat_sys_execveat
541 jmp return_from_execve
542 CFI_ENDPROC
543END(stub_x32_execveat)
544#endif
545
546#ifdef CONFIG_IA32_EMULATION
547 .align 8
548GLOBAL(stub32_execve)
549 CFI_STARTPROC
550 call compat_sys_execve
551 jmp return_from_execve
552 CFI_ENDPROC
553END(stub32_execve)
554 .align 8
555GLOBAL(stub32_execveat)
556 CFI_STARTPROC
557 call compat_sys_execveat
558 jmp return_from_execve
559 CFI_ENDPROC
560END(stub32_execveat)
561#endif
562
562/* 563/*
563 * sigreturn is special because it needs to restore all registers on return. 564 * sigreturn is special because it needs to restore all registers on return.
564 * This cannot be done with SYSRET, so use the IRET return path instead. 565 * This cannot be done with SYSRET, so use the IRET return path instead.
565 */ 566 */
566ENTRY(stub_rt_sigreturn) 567ENTRY(stub_rt_sigreturn)
567 CFI_STARTPROC 568 CFI_STARTPROC
568 addq $8, %rsp 569 DEFAULT_FRAME 0, 8
569 PARTIAL_FRAME 0 570 /*
570 SAVE_REST 571 * SAVE_EXTRA_REGS result is not normally needed:
571 FIXUP_TOP_OF_STACK %r11 572 * sigreturn overwrites all pt_regs->GPREGS.
573 * But sigreturn can fail (!), and there is no easy way to detect that.
574 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
575 * we SAVE_EXTRA_REGS here.
576 */
577 SAVE_EXTRA_REGS 8
572 call sys_rt_sigreturn 578 call sys_rt_sigreturn
573 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 579return_from_stub:
574 RESTORE_REST 580 addq $8, %rsp
581 CFI_ADJUST_CFA_OFFSET -8
582 RESTORE_EXTRA_REGS
583 movq %rax,RAX(%rsp)
575 jmp int_ret_from_sys_call 584 jmp int_ret_from_sys_call
576 CFI_ENDPROC 585 CFI_ENDPROC
577END(stub_rt_sigreturn) 586END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
579#ifdef CONFIG_X86_X32_ABI 588#ifdef CONFIG_X86_X32_ABI
580ENTRY(stub_x32_rt_sigreturn) 589ENTRY(stub_x32_rt_sigreturn)
581 CFI_STARTPROC 590 CFI_STARTPROC
582 addq $8, %rsp 591 DEFAULT_FRAME 0, 8
583 PARTIAL_FRAME 0 592 SAVE_EXTRA_REGS 8
584 SAVE_REST
585 FIXUP_TOP_OF_STACK %r11
586 call sys32_x32_rt_sigreturn 593 call sys32_x32_rt_sigreturn
587 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 594 jmp return_from_stub
588 RESTORE_REST
589 jmp int_ret_from_sys_call
590 CFI_ENDPROC 595 CFI_ENDPROC
591END(stub_x32_rt_sigreturn) 596END(stub_x32_rt_sigreturn)
597#endif
592 598
593ENTRY(stub_x32_execve) 599/*
594 CFI_STARTPROC 600 * A newly forked process directly context switches into this address.
595 addq $8, %rsp 601 *
596 PARTIAL_FRAME 0 602 * rdi: prev task we switched from
597 SAVE_REST 603 */
598 FIXUP_TOP_OF_STACK %r11 604ENTRY(ret_from_fork)
599 call compat_sys_execve 605 DEFAULT_FRAME
600 RESTORE_TOP_OF_STACK %r11
601 movq %rax,RAX(%rsp)
602 RESTORE_REST
603 jmp int_ret_from_sys_call
604 CFI_ENDPROC
605END(stub_x32_execve)
606 606
607ENTRY(stub_x32_execveat) 607 LOCK ; btr $TIF_FORK,TI_flags(%r8)
608 CFI_STARTPROC 608
609 addq $8, %rsp 609 pushq_cfi $0x0002
610 PARTIAL_FRAME 0 610 popfq_cfi # reset kernel eflags
611 SAVE_REST 611
612 FIXUP_TOP_OF_STACK %r11 612 call schedule_tail # rdi: 'prev' task parameter
613 call compat_sys_execveat 613
614 RESTORE_TOP_OF_STACK %r11 614 RESTORE_EXTRA_REGS
615 movq %rax,RAX(%rsp) 615
616 RESTORE_REST 616 testl $3,CS(%rsp) # from kernel_thread?
617
618 /*
619 * By the time we get here, we have no idea whether our pt_regs,
620 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
621 * the slow path, or one of the ia32entry paths.
622 * Use IRET code path to return, since it can safely handle
623 * all of the above.
624 */
625 jnz int_ret_from_sys_call
626
627 /* We came from kernel_thread */
628 /* nb: we depend on RESTORE_EXTRA_REGS above */
629 movq %rbp, %rdi
630 call *%rbx
631 movl $0, RAX(%rsp)
632 RESTORE_EXTRA_REGS
617 jmp int_ret_from_sys_call 633 jmp int_ret_from_sys_call
618 CFI_ENDPROC 634 CFI_ENDPROC
619END(stub_x32_execveat) 635END(ret_from_fork)
620
621#endif
622 636
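
The testl $3,CS(%rsp) in ret_from_fork above distinguishes a forked user task from a kernel_thread() child; a hedged one-liner equivalent in C (helper name is illustrative):

	#include <stdbool.h>

	static bool came_from_kernel_thread(unsigned long regs_cs)
	{
		/* RPL bits clear => kernel code segment => child of kernel_thread() */
		return (regs_cs & 3) == 0;
	}
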
623/* 637/*
624 * Build the entry stubs and pointer table with some assembler magic. 638 * Build the entry stubs with some assembler magic.
625 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 639 * We pack 1 stub into every 8-byte block.
626 * single cache line on all modern x86 implementations.
627 */ 640 */
628 .section .init.rodata,"a" 641 .align 8
629ENTRY(interrupt)
630 .section .entry.text
631 .p2align 5
632 .p2align CONFIG_X86_L1_CACHE_SHIFT
633ENTRY(irq_entries_start) 642ENTRY(irq_entries_start)
634 INTR_FRAME 643 INTR_FRAME
635vector=FIRST_EXTERNAL_VECTOR 644 vector=FIRST_EXTERNAL_VECTOR
636.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 645 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
637 .balign 32 646 pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
638 .rept 7 647 vector=vector+1
639 .if vector < FIRST_SYSTEM_VECTOR 648 jmp common_interrupt
640 .if vector <> FIRST_EXTERNAL_VECTOR
641 CFI_ADJUST_CFA_OFFSET -8 649 CFI_ADJUST_CFA_OFFSET -8
642 .endif 650 .align 8
6431: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ 651 .endr
644 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
645 jmp 2f
646 .endif
647 .previous
648 .quad 1b
649 .section .entry.text
650vector=vector+1
651 .endif
652 .endr
6532: jmp common_interrupt
654.endr
655 CFI_ENDPROC 652 CFI_ENDPROC
656END(irq_entries_start) 653END(irq_entries_start)
657 654
658.previous
659END(interrupt)
660.previous
661
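
Each 8-byte stub above pushes ~vector + 0x80 so the immediate always fits in a signed byte, and common_interrupt later does addq $-0x80,(%rsp), leaving ~vector on the stack (the orig_ax slot of pt_regs). A hedged C sketch of recovering the vector from that value (helper name is illustrative):

	static unsigned int vector_from_orig_ax(unsigned long orig_ax)
	{
		/* after the -0x80 adjustment, orig_ax holds the sign-extended ~vector */
		return (unsigned int)(~orig_ax & 0xff);
	}
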
662/* 655/*
663 * Interrupt entry/exit. 656 * Interrupt entry/exit.
664 * 657 *
@@ -669,47 +662,45 @@ END(interrupt)
669 662
670/* 0(%rsp): ~(interrupt number) */ 663/* 0(%rsp): ~(interrupt number) */
671 .macro interrupt func 664 .macro interrupt func
672 /* reserve pt_regs for scratch regs and rbp */
673 subq $ORIG_RAX-RBP, %rsp
674 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
675 cld 665 cld
676 /* start from rbp in pt_regs and jump over */ 666 /*
677 movq_cfi rdi, (RDI-RBP) 667 * Since nothing in interrupt handling code touches r12...r15 members
678 movq_cfi rsi, (RSI-RBP) 668 * of "struct pt_regs", and since interrupts can nest, we can save
679 movq_cfi rdx, (RDX-RBP) 669 * four stack slots and simultaneously provide
680 movq_cfi rcx, (RCX-RBP) 670 * an unwind-friendly stack layout by saving "truncated" pt_regs
681 movq_cfi rax, (RAX-RBP) 671 * exactly up to rbp slot, without these members.
682 movq_cfi r8, (R8-RBP) 672 */
683 movq_cfi r9, (R9-RBP) 673 ALLOC_PT_GPREGS_ON_STACK -RBP
684 movq_cfi r10, (R10-RBP) 674 SAVE_C_REGS -RBP
685 movq_cfi r11, (R11-RBP) 675 /* this goes to 0(%rsp) for unwinder, not for saving the value: */
686 676 SAVE_EXTRA_REGS_RBP -RBP
687 /* Save rbp so that we can unwind from get_irq_regs() */
688 movq_cfi rbp, 0
689
690 /* Save previous stack value */
691 movq %rsp, %rsi
692 677
693 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 678 leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
694 testl $3, CS-RBP(%rsi) 679
680 testl $3, CS-RBP(%rsp)
695 je 1f 681 je 1f
696 SWAPGS 682 SWAPGS
6831:
697 /* 684 /*
685 * Save previous stack pointer, optionally switch to interrupt stack.
698 * irq_count is used to check if a CPU is already on an interrupt stack 686 * irq_count is used to check if a CPU is already on an interrupt stack
699 * or not. While this is essentially redundant with preempt_count it is 687 * or not. While this is essentially redundant with preempt_count it is
700 * a little cheaper to use a separate counter in the PDA (short of 688 * a little cheaper to use a separate counter in the PDA (short of
701 * moving irq_enter into assembly, which would be too much work) 689 * moving irq_enter into assembly, which would be too much work)
702 */ 690 */
7031: incl PER_CPU_VAR(irq_count) 691 movq %rsp, %rsi
692 incl PER_CPU_VAR(irq_count)
704 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 693 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
705 CFI_DEF_CFA_REGISTER rsi 694 CFI_DEF_CFA_REGISTER rsi
706
707 /* Store previous stack value */
708 pushq %rsi 695 pushq %rsi
696 /*
697 * For debugger:
698 * "CFA (Current Frame Address) is the value on stack + offset"
699 */
709 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 700 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
710 0x77 /* DW_OP_breg7 */, 0, \ 701 0x77 /* DW_OP_breg7 (rsp) */, 0, \
711 0x06 /* DW_OP_deref */, \ 702 0x06 /* DW_OP_deref */, \
712 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 703 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
713 0x22 /* DW_OP_plus */ 704 0x22 /* DW_OP_plus */
714 /* We entered an interrupt context - irqs are off: */ 705 /* We entered an interrupt context - irqs are off: */
715 TRACE_IRQS_OFF 706 TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
727 ASM_CLAC 718 ASM_CLAC
728 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 719 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
729 interrupt do_IRQ 720 interrupt do_IRQ
730 /* 0(%rsp): old_rsp-ARGOFFSET */ 721 /* 0(%rsp): old RSP */
731ret_from_intr: 722ret_from_intr:
732 DISABLE_INTERRUPTS(CLBR_NONE) 723 DISABLE_INTERRUPTS(CLBR_NONE)
733 TRACE_IRQS_OFF 724 TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
735 726
736 /* Restore saved previous stack */ 727 /* Restore saved previous stack */
737 popq %rsi 728 popq %rsi
738 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 729 CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
739 leaq ARGOFFSET-RBP(%rsi), %rsp 730 /* return code expects complete pt_regs - adjust rsp accordingly: */
731 leaq -RBP(%rsi),%rsp
740 CFI_DEF_CFA_REGISTER rsp 732 CFI_DEF_CFA_REGISTER rsp
741 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 733 CFI_ADJUST_CFA_OFFSET RBP
742 734
743exit_intr: 735 testl $3,CS(%rsp)
744 GET_THREAD_INFO(%rcx)
745 testl $3,CS-ARGOFFSET(%rsp)
746 je retint_kernel 736 je retint_kernel
747
748 /* Interrupt came from user space */ 737 /* Interrupt came from user space */
738
739 GET_THREAD_INFO(%rcx)
749 /* 740 /*
750 * Has a correct top of stack, but a partial stack frame
751 * %rcx: thread info. Interrupts off. 741 * %rcx: thread info. Interrupts off.
752 */ 742 */
753retint_with_reschedule: 743retint_with_reschedule:
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */
766 DISABLE_INTERRUPTS(CLBR_ANY) 756 DISABLE_INTERRUPTS(CLBR_ANY)
767 TRACE_IRQS_IRETQ 757 TRACE_IRQS_IRETQ
768 758
769 /*
770 * Try to use SYSRET instead of IRET if we're returning to
771 * a completely clean 64-bit userspace context.
772 */
773 movq (RCX-R11)(%rsp), %rcx
774 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
775 jne opportunistic_sysret_failed
776
777 /*
778 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
779 * in kernel space. This essentially lets the user take over
780 * the kernel, since userspace controls RSP. It's not worth
781 * testing for canonicalness exactly -- this check detects any
782 * of the 17 high bits set, which is true for non-canonical
783 * or kernel addresses. (This will pessimize vsyscall=native.
784 * Big deal.)
785 *
786 * If virtual addresses ever become wider, this will need
787 * to be updated to remain correct on both old and new CPUs.
788 */
789 .ifne __VIRTUAL_MASK_SHIFT - 47
790 .error "virtual address width changed -- sysret checks need update"
791 .endif
792 shr $__VIRTUAL_MASK_SHIFT, %rcx
793 jnz opportunistic_sysret_failed
794
795 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
796 jne opportunistic_sysret_failed
797
798 movq (R11-ARGOFFSET)(%rsp), %r11
799 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
800 jne opportunistic_sysret_failed
801
802 /*
803 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
804 * restoring TF results in a trap from userspace immediately after
805 * SYSRET. This would cause an infinite loop whenever #DB happens
806 * with register state that satisfies the opportunistic SYSRET
807 * conditions. For example, single-stepping this user code:
808 *
809 * movq $stuck_here,%rcx
810 * pushfq
811 * popq %r11
812 * stuck_here:
813 *
814 * would never get past 'stuck_here'.
815 */
816 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
817 jnz opportunistic_sysret_failed
818
819 /* nothing to check for RSP */
820
821 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
822 jne opportunistic_sysret_failed
823
824 /*
825 * We win! This label is here just for ease of understanding
826 * perf profiles. Nothing jumps here.
827 */
828irq_return_via_sysret:
829 CFI_REMEMBER_STATE
830 RESTORE_ARGS 1,8,1
831 movq (RSP-RIP)(%rsp),%rsp
832 USERGS_SYSRET64
833 CFI_RESTORE_STATE
834
835opportunistic_sysret_failed:
836 SWAPGS 759 SWAPGS
837 jmp restore_args 760 jmp restore_c_regs_and_iret
838 761
839retint_restore_args: /* return to kernel space */ 762/* Returning to kernel space */
840 DISABLE_INTERRUPTS(CLBR_ANY) 763retint_kernel:
764#ifdef CONFIG_PREEMPT
765 /* Interrupts are off */
766 /* Check if we need preemption */
767 bt $9,EFLAGS(%rsp) /* interrupts were off? */
768 jnc 1f
7690: cmpl $0,PER_CPU_VAR(__preempt_count)
770 jnz 1f
771 call preempt_schedule_irq
772 jmp 0b
7731:
774#endif
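
A compact C restatement (sketch only) of the new in-line retint_kernel preemption test above: preempt only if the interrupted context had interrupts enabled (EFLAGS.IF, bit 9) and the per-CPU preempt count is zero:

	#include <stdbool.h>

	#define X86_EFLAGS_IF (1UL << 9)

	static bool should_preempt_on_irq_return(unsigned long regs_flags, int preempt_count)
	{
		/* bt $9,EFLAGS(%rsp); jnc 1f   +   cmpl $0,__preempt_count; jnz 1f */
		return (regs_flags & X86_EFLAGS_IF) && preempt_count == 0;
	}
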
841 /* 775 /*
842 * The iretq could re-enable interrupts: 776 * The iretq could re-enable interrupts:
843 */ 777 */
844 TRACE_IRQS_IRETQ 778 TRACE_IRQS_IRETQ
845restore_args: 779
846 RESTORE_ARGS 1,8,1 780/*
781 * At this label, code paths which return to kernel and to user,
782 * which come from interrupts/exception and from syscalls, merge.
783 */
784restore_c_regs_and_iret:
785 RESTORE_C_REGS
786 REMOVE_PT_GPREGS_FROM_STACK 8
847 787
848irq_return: 788irq_return:
849 INTERRUPT_RETURN 789 INTERRUPT_RETURN
@@ -914,28 +854,17 @@ retint_signal:
914 jz retint_swapgs 854 jz retint_swapgs
915 TRACE_IRQS_ON 855 TRACE_IRQS_ON
916 ENABLE_INTERRUPTS(CLBR_NONE) 856 ENABLE_INTERRUPTS(CLBR_NONE)
917 SAVE_REST 857 SAVE_EXTRA_REGS
918 movq $-1,ORIG_RAX(%rsp) 858 movq $-1,ORIG_RAX(%rsp)
919 xorl %esi,%esi # oldset 859 xorl %esi,%esi # oldset
920 movq %rsp,%rdi # &pt_regs 860 movq %rsp,%rdi # &pt_regs
921 call do_notify_resume 861 call do_notify_resume
922 RESTORE_REST 862 RESTORE_EXTRA_REGS
923 DISABLE_INTERRUPTS(CLBR_NONE) 863 DISABLE_INTERRUPTS(CLBR_NONE)
924 TRACE_IRQS_OFF 864 TRACE_IRQS_OFF
925 GET_THREAD_INFO(%rcx) 865 GET_THREAD_INFO(%rcx)
926 jmp retint_with_reschedule 866 jmp retint_with_reschedule
927 867
928#ifdef CONFIG_PREEMPT
929 /* Returning to kernel space. Check if we need preemption */
930 /* rcx: threadinfo. interrupts off. */
931ENTRY(retint_kernel)
932 cmpl $0,PER_CPU_VAR(__preempt_count)
933 jnz retint_restore_args
934 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
935 jnc retint_restore_args
936 call preempt_schedule_irq
937 jmp exit_intr
938#endif
939 CFI_ENDPROC 868 CFI_ENDPROC
940END(common_interrupt) 869END(common_interrupt)
941 870
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
1024/* 953/*
1025 * Exception entry points. 954 * Exception entry points.
1026 */ 955 */
1027#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 956#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
1028 957
1029.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 958.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
1030ENTRY(\sym) 959ENTRY(\sym)
@@ -1046,8 +975,7 @@ ENTRY(\sym)
1046 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 975 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1047 .endif 976 .endif
1048 977
1049 subq $ORIG_RAX-R15, %rsp 978 ALLOC_PT_GPREGS_ON_STACK
1050 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1051 979
1052 .if \paranoid 980 .if \paranoid
1053 .if \paranoid == 1 981 .if \paranoid == 1
@@ -1055,10 +983,11 @@ ENTRY(\sym)
1055 testl $3, CS(%rsp) /* If coming from userspace, switch */ 983 testl $3, CS(%rsp) /* If coming from userspace, switch */
1056 jnz 1f /* stacks. */ 984 jnz 1f /* stacks. */
1057 .endif 985 .endif
1058 call save_paranoid 986 call paranoid_entry
1059 .else 987 .else
1060 call error_entry 988 call error_entry
1061 .endif 989 .endif
990 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
1062 991
1063 DEFAULT_FRAME 0 992 DEFAULT_FRAME 0
1064 993
@@ -1080,19 +1009,20 @@ ENTRY(\sym)
1080 .endif 1009 .endif
1081 1010
1082 .if \shift_ist != -1 1011 .if \shift_ist != -1
1083 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1012 subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1084 .endif 1013 .endif
1085 1014
1086 call \do_sym 1015 call \do_sym
1087 1016
1088 .if \shift_ist != -1 1017 .if \shift_ist != -1
1089 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1018 addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1090 .endif 1019 .endif
1091 1020
1021 /* these procedures expect "no swapgs" flag in ebx */
1092 .if \paranoid 1022 .if \paranoid
1093 jmp paranoid_exit /* %ebx: no swapgs flag */ 1023 jmp paranoid_exit
1094 .else 1024 .else
1095 jmp error_exit /* %ebx: no swapgs flag */ 1025 jmp error_exit
1096 .endif 1026 .endif
1097 1027
1098 .if \paranoid == 1 1028 .if \paranoid == 1
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
1296 addq $0x30,%rsp 1226 addq $0x30,%rsp
1297 CFI_ADJUST_CFA_OFFSET -0x30 1227 CFI_ADJUST_CFA_OFFSET -0x30
1298 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1228 pushq_cfi $-1 /* orig_ax = -1 => not a system call */
1299 SAVE_ALL 1229 ALLOC_PT_GPREGS_ON_STACK
1230 SAVE_C_REGS
1231 SAVE_EXTRA_REGS
1300 jmp error_exit 1232 jmp error_exit
1301 CFI_ENDPROC 1233 CFI_ENDPROC
1302END(xen_failsafe_callback) 1234END(xen_failsafe_callback)
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
1328idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) 1260idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
1329#endif 1261#endif
1330 1262
1331 /* 1263/*
1332 * "Paranoid" exit path from exception stack. This is invoked 1264 * Save all registers in pt_regs, and switch gs if needed.
1333 * only on return from non-NMI IST interrupts that came 1265 * Use slow, but surefire "are we in kernel?" check.
1334 * from kernel space. 1266 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1335 * 1267 */
1336 * We may be returning to very strange contexts (e.g. very early 1268ENTRY(paranoid_entry)
1337 * in syscall entry), so checking for preemption here would 1269 XCPT_FRAME 1 15*8
1338 * be complicated. Fortunately, we there's no good reason 1270 cld
1339 * to try to handle preemption here. 1271 SAVE_C_REGS 8
1340 */ 1272 SAVE_EXTRA_REGS 8
1273 movl $1,%ebx
1274 movl $MSR_GS_BASE,%ecx
1275 rdmsr
1276 testl %edx,%edx
1277 js 1f /* negative -> in kernel */
1278 SWAPGS
1279 xorl %ebx,%ebx
12801: ret
1281 CFI_ENDPROC
1282END(paranoid_entry)
1341 1283
1342 /* ebx: no swapgs flag */ 1284/*
1285 * "Paranoid" exit path from exception stack. This is invoked
1286 * only on return from non-NMI IST interrupts that came
1287 * from kernel space.
1288 *
1289 * We may be returning to very strange contexts (e.g. very early
1290 * in syscall entry), so checking for preemption here would
1291 * be complicated. Fortunately, there's no good reason
1292 * to try to handle preemption here.
1293 */
1294/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1343ENTRY(paranoid_exit) 1295ENTRY(paranoid_exit)
1344 DEFAULT_FRAME 1296 DEFAULT_FRAME
1345 DISABLE_INTERRUPTS(CLBR_NONE) 1297 DISABLE_INTERRUPTS(CLBR_NONE)
1346 TRACE_IRQS_OFF_DEBUG 1298 TRACE_IRQS_OFF_DEBUG
1347 testl %ebx,%ebx /* swapgs needed? */ 1299 testl %ebx,%ebx /* swapgs needed? */
1348 jnz paranoid_restore 1300 jnz paranoid_exit_no_swapgs
1349 TRACE_IRQS_IRETQ 0 1301 TRACE_IRQS_IRETQ
1350 SWAPGS_UNSAFE_STACK 1302 SWAPGS_UNSAFE_STACK
1351 RESTORE_ALL 8 1303 jmp paranoid_exit_restore
1352 INTERRUPT_RETURN 1304paranoid_exit_no_swapgs:
1353paranoid_restore: 1305 TRACE_IRQS_IRETQ_DEBUG
1354 TRACE_IRQS_IRETQ_DEBUG 0 1306paranoid_exit_restore:
1355 RESTORE_ALL 8 1307 RESTORE_EXTRA_REGS
1308 RESTORE_C_REGS
1309 REMOVE_PT_GPREGS_FROM_STACK 8
1356 INTERRUPT_RETURN 1310 INTERRUPT_RETURN
1357 CFI_ENDPROC 1311 CFI_ENDPROC
1358END(paranoid_exit) 1312END(paranoid_exit)
1359 1313
1360/* 1314/*
1361 * Exception entry point. This expects an error code/orig_rax on the stack. 1315 * Save all registers in pt_regs, and switch gs if needed.
1362 * returns in "no swapgs flag" in %ebx. 1316 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1363 */ 1317 */
1364ENTRY(error_entry) 1318ENTRY(error_entry)
1365 XCPT_FRAME 1319 XCPT_FRAME 1 15*8
1366 CFI_ADJUST_CFA_OFFSET 15*8
1367 /* oldrax contains error code */
1368 cld 1320 cld
1369 movq %rdi, RDI+8(%rsp) 1321 SAVE_C_REGS 8
1370 movq %rsi, RSI+8(%rsp) 1322 SAVE_EXTRA_REGS 8
1371 movq %rdx, RDX+8(%rsp)
1372 movq %rcx, RCX+8(%rsp)
1373 movq %rax, RAX+8(%rsp)
1374 movq %r8, R8+8(%rsp)
1375 movq %r9, R9+8(%rsp)
1376 movq %r10, R10+8(%rsp)
1377 movq %r11, R11+8(%rsp)
1378 movq_cfi rbx, RBX+8
1379 movq %rbp, RBP+8(%rsp)
1380 movq %r12, R12+8(%rsp)
1381 movq %r13, R13+8(%rsp)
1382 movq %r14, R14+8(%rsp)
1383 movq %r15, R15+8(%rsp)
1384 xorl %ebx,%ebx 1323 xorl %ebx,%ebx
1385 testl $3,CS+8(%rsp) 1324 testl $3,CS+8(%rsp)
1386 je error_kernelspace 1325 je error_kernelspace
@@ -1390,12 +1329,12 @@ error_sti:
1390 TRACE_IRQS_OFF 1329 TRACE_IRQS_OFF
1391 ret 1330 ret
1392 1331
1393/* 1332 /*
1394 * There are two places in the kernel that can potentially fault with 1333 * There are two places in the kernel that can potentially fault with
1395 * usergs. Handle them here. B stepping K8s sometimes report a 1334 * usergs. Handle them here. B stepping K8s sometimes report a
1396 * truncated RIP for IRET exceptions returning to compat mode. Check 1335 * truncated RIP for IRET exceptions returning to compat mode. Check
1397 * for these here too. 1336 * for these here too.
1398 */ 1337 */
1399error_kernelspace: 1338error_kernelspace:
1400 CFI_REL_OFFSET rcx, RCX+8 1339 CFI_REL_OFFSET rcx, RCX+8
1401 incl %ebx 1340 incl %ebx
@@ -1425,11 +1364,11 @@ error_bad_iret:
1425END(error_entry) 1364END(error_entry)
1426 1365
1427 1366
1428/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ 1367/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1429ENTRY(error_exit) 1368ENTRY(error_exit)
1430 DEFAULT_FRAME 1369 DEFAULT_FRAME
1431 movl %ebx,%eax 1370 movl %ebx,%eax
1432 RESTORE_REST 1371 RESTORE_EXTRA_REGS
1433 DISABLE_INTERRUPTS(CLBR_NONE) 1372 DISABLE_INTERRUPTS(CLBR_NONE)
1434 TRACE_IRQS_OFF 1373 TRACE_IRQS_OFF
1435 GET_THREAD_INFO(%rcx) 1374 GET_THREAD_INFO(%rcx)
@@ -1444,19 +1383,7 @@ ENTRY(error_exit)
1444 CFI_ENDPROC 1383 CFI_ENDPROC
1445END(error_exit) 1384END(error_exit)
1446 1385
1447/* 1386/* Runs on exception stack */
1448 * Test if a given stack is an NMI stack or not.
1449 */
1450 .macro test_in_nmi reg stack nmi_ret normal_ret
1451 cmpq %\reg, \stack
1452 ja \normal_ret
1453 subq $EXCEPTION_STKSZ, %\reg
1454 cmpq %\reg, \stack
1455 jb \normal_ret
1456 jmp \nmi_ret
1457 .endm
1458
1459 /* runs on exception stack */
1460ENTRY(nmi) 1387ENTRY(nmi)
1461 INTR_FRAME 1388 INTR_FRAME
1462 PARAVIRT_ADJUST_EXCEPTION_FRAME 1389 PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1492,7 +1419,7 @@ ENTRY(nmi)
1492 * NMI. 1419 * NMI.
1493 */ 1420 */
1494 1421
1495 /* Use %rdx as out temp variable throughout */ 1422 /* Use %rdx as our temp variable throughout */
1496 pushq_cfi %rdx 1423 pushq_cfi %rdx
1497 CFI_REL_OFFSET rdx, 0 1424 CFI_REL_OFFSET rdx, 0
1498 1425
@@ -1517,8 +1444,17 @@ ENTRY(nmi)
1517 * We check the variable because the first NMI could be in a 1444 * We check the variable because the first NMI could be in a
1518 * breakpoint routine using a breakpoint stack. 1445 * breakpoint routine using a breakpoint stack.
1519 */ 1446 */
1520 lea 6*8(%rsp), %rdx 1447 lea 6*8(%rsp), %rdx
1521 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1448 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1449 cmpq %rdx, 4*8(%rsp)
1450 /* If the stack pointer is above the NMI stack, this is a normal NMI */
1451 ja first_nmi
1452 subq $EXCEPTION_STKSZ, %rdx
1453 cmpq %rdx, 4*8(%rsp)
1454 /* If it is below the NMI stack, it is a normal NMI */
1455 jb first_nmi
1456 /* Ah, it is within the NMI stack, treat it as nested */
1457
1522 CFI_REMEMBER_STATE 1458 CFI_REMEMBER_STATE
1523 1459
1524nested_nmi: 1460nested_nmi:
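
The open-coded range check above (replacing the old test_in_nmi macro) decides whether the interrupted RSP lies inside the NMI IST stack. A hedged C equivalent, with EXCEPTION_STKSZ taken as one 4 KiB page purely for illustration:

	#include <stdbool.h>
	#include <stdint.h>

	#define EXCEPTION_STKSZ_SKETCH 4096UL	/* assumption for illustration */

	static bool nmi_is_nested(uint64_t rsp_now, uint64_t interrupted_rsp)
	{
		uint64_t stack_top    = rsp_now + 6 * 8;	/* lea 6*8(%rsp), %rdx */
		uint64_t stack_bottom = stack_top - EXCEPTION_STKSZ_SKETCH;

		if (interrupted_rsp > stack_top)	/* ja first_nmi */
			return false;
		if (interrupted_rsp < stack_bottom)	/* jb first_nmi */
			return false;
		return true;				/* within the NMI stack: nested */
	}
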
@@ -1611,7 +1547,7 @@ first_nmi:
1611 .rept 5 1547 .rept 5
1612 pushq_cfi 11*8(%rsp) 1548 pushq_cfi 11*8(%rsp)
1613 .endr 1549 .endr
1614 CFI_DEF_CFA_OFFSET SS+8-RIP 1550 CFI_DEF_CFA_OFFSET 5*8
1615 1551
1616 /* Everything up to here is safe from nested NMIs */ 1552 /* Everything up to here is safe from nested NMIs */
1617 1553
@@ -1639,7 +1575,7 @@ repeat_nmi:
1639 pushq_cfi -6*8(%rsp) 1575 pushq_cfi -6*8(%rsp)
1640 .endr 1576 .endr
1641 subq $(5*8), %rsp 1577 subq $(5*8), %rsp
1642 CFI_DEF_CFA_OFFSET SS+8-RIP 1578 CFI_DEF_CFA_OFFSET 5*8
1643end_repeat_nmi: 1579end_repeat_nmi:
1644 1580
1645 /* 1581 /*
@@ -1648,16 +1584,16 @@ end_repeat_nmi:
1648 * so that we repeat another NMI. 1584 * so that we repeat another NMI.
1649 */ 1585 */
1650 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1586 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1651 subq $ORIG_RAX-R15, %rsp 1587 ALLOC_PT_GPREGS_ON_STACK
1652 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1588
1653 /* 1589 /*
1654 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit 1590 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1655 * as we should not be calling schedule in NMI context. 1591 * as we should not be calling schedule in NMI context.
1656 * Even with normal interrupts enabled. An NMI should not be 1592 * Even with normal interrupts enabled. An NMI should not be
1657 * setting NEED_RESCHED or anything that normal interrupts and 1593 * setting NEED_RESCHED or anything that normal interrupts and
1658 * exceptions might do. 1594 * exceptions might do.
1659 */ 1595 */
1660 call save_paranoid 1596 call paranoid_entry
1661 DEFAULT_FRAME 0 1597 DEFAULT_FRAME 0
1662 1598
1663 /* 1599 /*
@@ -1688,8 +1624,10 @@ end_repeat_nmi:
1688nmi_swapgs: 1624nmi_swapgs:
1689 SWAPGS_UNSAFE_STACK 1625 SWAPGS_UNSAFE_STACK
1690nmi_restore: 1626nmi_restore:
1627 RESTORE_EXTRA_REGS
1628 RESTORE_C_REGS
1691 /* Pop the extra iret frame at once */ 1629 /* Pop the extra iret frame at once */
1692 RESTORE_ALL 6*8 1630 REMOVE_PT_GPREGS_FROM_STACK 6*8
1693 1631
1694 /* Clear the NMI executing stack variable */ 1632 /* Clear the NMI executing stack variable */
1695 movq $0, 5*8(%rsp) 1633 movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c4f8d4659070..2b55ee6db053 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
177 */ 177 */
178 load_ucode_bsp(); 178 load_ucode_bsp();
179 179
180 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
181 early_printk("Kernel alive\n");
182
183 clear_page(init_level4_pgt); 180 clear_page(init_level4_pgt);
184 /* set init_level4_pgt kernel high mapping*/ 181 /* set init_level4_pgt kernel high mapping*/
185 init_level4_pgt[511] = early_level4_pgt[511]; 182 init_level4_pgt[511] = early_level4_pgt[511];
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h> 24#include <asm/nops.h>
25#include <asm/bootparam.h>
25 26
26/* Physical address */ 27/* Physical address */
27#define pa(X) ((X) - __PAGE_OFFSET) 28#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
90 91
91 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 92 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
92 us to not reload segments */ 93 us to not reload segments */
93 testb $(1<<6), BP_loadflags(%esi) 94 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
94 jnz 2f 95 jnz 2f
95 96
96/* 97/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit 2 * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
56 * %rsi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
57 * 57 *
58 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
59 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86/boot/compressed/head_64.S.
60 * 60 *
61 * We only come here initially at boot nothing else comes here. 61 * We only come here initially at boot nothing else comes here.
62 * 62 *
@@ -146,7 +146,7 @@ startup_64:
146 leaq level2_kernel_pgt(%rip), %rdi 146 leaq level2_kernel_pgt(%rip), %rdi
147 leaq 4096(%rdi), %r8 147 leaq 4096(%rdi), %r8
148 /* See if it is a valid page table entry */ 148 /* See if it is a valid page table entry */
1491: testq $1, 0(%rdi) 1491: testb $1, 0(%rdi)
150 jz 2f 150 jz 2f
151 addq %rbp, 0(%rdi) 151 addq %rbp, 0(%rdi)
152 /* Go to the next page */ 152 /* Go to the next page */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..009183276bb7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void)
42 * be set (so that the clts/stts pair does nothing that is 42 * be set (so that the clts/stts pair does nothing that is
43 * visible in the interrupted kernel thread). 43 * visible in the interrupted kernel thread).
44 * 44 *
45 * Except for the eagerfpu case when we return 1 unless we've already 45 * Except for the eagerfpu case when we return true; in the likely case
46 * been eager and saved the state in kernel_fpu_begin(). 46 * the thread has FPU but we are not going to set/clear TS.
47 */ 47 */
48static inline bool interrupted_kernel_fpu_idle(void) 48static inline bool interrupted_kernel_fpu_idle(void)
49{ 49{
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
51 return false; 51 return false;
52 52
53 if (use_eager_fpu()) 53 if (use_eager_fpu())
54 return __thread_has_fpu(current); 54 return true;
55 55
56 return !__thread_has_fpu(current) && 56 return !__thread_has_fpu(current) &&
57 (read_cr0() & X86_CR0_TS); 57 (read_cr0() & X86_CR0_TS);
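
A standalone sketch of the changed tail of interrupted_kernel_fpu_idle() (the early-return guard before the lines shown is omitted here): with eager FPU it now returns true, otherwise the lazy TS/ownership test is unchanged:

	#include <stdbool.h>

	static bool kernel_fpu_idle_tail(bool eager_fpu, bool thread_has_fpu, bool cr0_ts_set)
	{
		if (eager_fpu)
			return true;			/* was: return __thread_has_fpu(current) */
		return !thread_has_fpu && cr0_ts_set;	/* lazy case unchanged */
	}
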
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
68static inline bool interrupted_user_mode(void) 68static inline bool interrupted_user_mode(void)
69{ 69{
70 struct pt_regs *regs = get_irq_regs(); 70 struct pt_regs *regs = get_irq_regs();
71 return regs && user_mode_vm(regs); 71 return regs && user_mode(regs);
72} 72}
73 73
74/* 74/*
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void)
94 94
95 if (__thread_has_fpu(me)) { 95 if (__thread_has_fpu(me)) {
96 __save_init_fpu(me); 96 __save_init_fpu(me);
97 } else if (!use_eager_fpu()) { 97 } else {
98 this_cpu_write(fpu_owner_task, NULL); 98 this_cpu_write(fpu_owner_task, NULL);
99 clts(); 99 if (!use_eager_fpu())
100 clts();
100 } 101 }
101} 102}
102EXPORT_SYMBOL(__kernel_fpu_begin); 103EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void)
107 108
108 if (__thread_has_fpu(me)) { 109 if (__thread_has_fpu(me)) {
109 if (WARN_ON(restore_fpu_checking(me))) 110 if (WARN_ON(restore_fpu_checking(me)))
110 drop_init_fpu(me); 111 fpu_reset_state(me);
111 } else if (!use_eager_fpu()) { 112 } else if (!use_eager_fpu()) {
112 stts(); 113 stts();
113 } 114 }
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk)
120{ 121{
121 preempt_disable(); 122 preempt_disable();
122 if (__thread_has_fpu(tsk)) { 123 if (__thread_has_fpu(tsk)) {
123 __save_init_fpu(tsk); 124 if (use_eager_fpu()) {
124 __thread_fpu_end(tsk); 125 __save_fpu(tsk);
125 } else 126 } else {
126 tsk->thread.fpu_counter = 0; 127 __save_init_fpu(tsk);
128 __thread_fpu_end(tsk);
129 }
130 }
127 preempt_enable(); 131 preempt_enable();
128} 132}
129EXPORT_SYMBOL(unlazy_fpu); 133EXPORT_SYMBOL(unlazy_fpu);
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu)
221 return; 225 return;
222 } 226 }
223 227
228 memset(fpu->state, 0, xstate_size);
229
224 if (cpu_has_fxsr) { 230 if (cpu_has_fxsr) {
225 fx_finit(&fpu->state->fxsave); 231 fx_finit(&fpu->state->fxsave);
226 } else { 232 } else {
227 struct i387_fsave_struct *fp = &fpu->state->fsave; 233 struct i387_fsave_struct *fp = &fpu->state->fsave;
228 memset(fp, 0, xstate_size);
229 fp->cwd = 0xffff037fu; 234 fp->cwd = 0xffff037fu;
230 fp->swd = 0xffff0000u; 235 fp->swd = 0xffff0000u;
231 fp->twd = 0xffffffffu; 236 fp->twd = 0xffffffffu;
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk)
247 if (tsk_used_math(tsk)) { 252 if (tsk_used_math(tsk)) {
248 if (cpu_has_fpu && tsk == current) 253 if (cpu_has_fpu && tsk == current)
249 unlazy_fpu(tsk); 254 unlazy_fpu(tsk);
250 tsk->thread.fpu.last_cpu = ~0; 255 task_disable_lazy_fpu_restore(tsk);
251 return 0; 256 return 0;
252 } 257 }
253 258
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
336 unsigned int pos, unsigned int count, 341 unsigned int pos, unsigned int count,
337 void *kbuf, void __user *ubuf) 342 void *kbuf, void __user *ubuf)
338{ 343{
344 struct xsave_struct *xsave;
339 int ret; 345 int ret;
340 346
341 if (!cpu_has_xsave) 347 if (!cpu_has_xsave)
@@ -345,19 +351,19 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
345 if (ret) 351 if (ret)
346 return ret; 352 return ret;
347 353
354 xsave = &target->thread.fpu.state->xsave;
355
348 /* 356 /*
349 * Copy the 48bytes defined by the software first into the xstate 357 * Copy the 48bytes defined by the software first into the xstate
350 * memory layout in the thread struct, so that we can copy the entire 358 * memory layout in the thread struct, so that we can copy the entire
351 * xstateregs to the user using one user_regset_copyout(). 359 * xstateregs to the user using one user_regset_copyout().
352 */ 360 */
353 memcpy(&target->thread.fpu.state->fxsave.sw_reserved, 361 memcpy(&xsave->i387.sw_reserved,
354 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 362 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
355
356 /* 363 /*
357 * Copy the xstate memory layout. 364 * Copy the xstate memory layout.
358 */ 365 */
359 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 366 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
360 &target->thread.fpu.state->xsave, 0, -1);
361 return ret; 367 return ret;
362} 368}
363 369
@@ -365,8 +371,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
365 unsigned int pos, unsigned int count, 371 unsigned int pos, unsigned int count,
366 const void *kbuf, const void __user *ubuf) 372 const void *kbuf, const void __user *ubuf)
367{ 373{
374 struct xsave_struct *xsave;
368 int ret; 375 int ret;
369 struct xsave_hdr_struct *xsave_hdr;
370 376
371 if (!cpu_has_xsave) 377 if (!cpu_has_xsave)
372 return -ENODEV; 378 return -ENODEV;
@@ -375,22 +381,18 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
375 if (ret) 381 if (ret)
376 return ret; 382 return ret;
377 383
378 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 384 xsave = &target->thread.fpu.state->xsave;
379 &target->thread.fpu.state->xsave, 0, -1);
380 385
386 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
381 /* 387 /*
382 * mxcsr reserved bits must be masked to zero for security reasons. 388 * mxcsr reserved bits must be masked to zero for security reasons.
383 */ 389 */
384 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; 390 xsave->i387.mxcsr &= mxcsr_feature_mask;
385 391 xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
386 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
387
388 xsave_hdr->xstate_bv &= pcntxt_mask;
389 /* 392 /*
390 * These bits must be zero. 393 * These bits must be zero.
391 */ 394 */
392 memset(xsave_hdr->reserved, 0, 48); 395 memset(&xsave->xsave_hdr.reserved, 0, 48);
393
394 return ret; 396 return ret;
395} 397}
396 398
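
The sanitization of user-supplied xstate above can be summarized in C; the struct below is a simplified, hypothetical stand-in for the real xsave_struct, and the masks mirror mxcsr_feature_mask and pcntxt_mask:

	#include <stdint.h>
	#include <string.h>

	struct xsave_sketch {
		uint32_t mxcsr;		/* stands in for xsave->i387.mxcsr */
		uint64_t xstate_bv;	/* stands in for xsave->xsave_hdr.xstate_bv */
		uint8_t  reserved[48];	/* stands in for xsave->xsave_hdr.reserved */
	};

	static void sanitize_user_xstate(struct xsave_sketch *x,
					 uint32_t mxcsr_feature_mask,
					 uint64_t pcntxt_mask)
	{
		x->mxcsr &= mxcsr_feature_mask;		/* reserved MXCSR bits must be zero */
		x->xstate_bv &= pcntxt_mask;		/* keep only supported state bits */
		memset(x->reserved, 0, sizeof(x->reserved));
	}
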
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
54 * because the ->io_bitmap_max value must match the bitmap 54 * because the ->io_bitmap_max value must match the bitmap
55 * contents: 55 * contents:
56 */ 56 */
57 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(cpu_tss, get_cpu());
58 58
59 if (turn_on) 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num); 60 bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe0093a..e5952c225532 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
295 295
296 this_cpu = smp_processor_id(); 296 this_cpu = smp_processor_id();
297 cpumask_copy(&online_new, cpu_online_mask); 297 cpumask_copy(&online_new, cpu_online_mask);
298 cpu_clear(this_cpu, online_new); 298 cpumask_clear_cpu(this_cpu, &online_new);
299 299
300 this_count = 0; 300 this_count = 0;
301 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 301 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
307 307
308 data = irq_desc_get_irq_data(desc); 308 data = irq_desc_get_irq_data(desc);
309 cpumask_copy(&affinity_new, data->affinity); 309 cpumask_copy(&affinity_new, data->affinity);
310 cpu_clear(this_cpu, affinity_new); 310 cpumask_clear_cpu(this_cpu, &affinity_new);
311 311
312 /* Do not count inactive or per-cpu irqs. */ 312 /* Do not count inactive or per-cpu irqs. */
313 if (!irq_has_action(irq) || irqd_is_per_cpu(data)) 313 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
165 if (unlikely(!desc)) 165 if (unlikely(!desc))
166 return false; 166 return false;
167 167
168 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { 168 if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
169 if (unlikely(overflow)) 169 if (unlikely(overflow))
170 print_stack_overflow(); 170 print_stack_overflow();
171 desc->handle_irq(irq, desc); 171 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
44 u64 estack_top, estack_bottom; 44 u64 estack_top, estack_bottom;
45 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
46 46
47 if (user_mode_vm(regs)) 47 if (user_mode(regs))
48 return; 48 return;
49 49
50 if (regs->sp >= curbase + sizeof(struct thread_info) + 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
178#endif 178#endif
179 for_each_clear_bit_from(i, used_vectors, first_system_vector) { 179 for_each_clear_bit_from(i, used_vectors, first_system_vector) {
180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
181 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 181 set_intr_gate(i, irq_entries_start +
182 8 * (i - FIRST_EXTERNAL_VECTOR));
182 } 183 }
183#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
184 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) 185 for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
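
With one fixed-size 8-byte stub per vector, the gate address in the hunk above is a plain linear computation; a hedged helper (names illustrative, not kernel API):

	static void *irq_stub_address(void *irq_entries_start, unsigned int vector,
				      unsigned int first_external_vector)
	{
		/* matches: irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR) */
		return (char *)irq_entries_start + 8 * (vector - first_external_vector);
	}
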
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 25ecd56cefa8..d6178d9791db 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
126#ifdef CONFIG_X86_32 126#ifdef CONFIG_X86_32
127 switch (regno) { 127 switch (regno) {
128 case GDB_SS: 128 case GDB_SS:
129 if (!user_mode_vm(regs)) 129 if (!user_mode(regs))
130 *(unsigned long *)mem = __KERNEL_DS; 130 *(unsigned long *)mem = __KERNEL_DS;
131 break; 131 break;
132 case GDB_SP: 132 case GDB_SP:
133 if (!user_mode_vm(regs)) 133 if (!user_mode(regs))
134 *(unsigned long *)mem = kernel_stack_pointer(regs); 134 *(unsigned long *)mem = kernel_stack_pointer(regs);
135 break; 135 break;
136 case GDB_GS: 136 case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..1deffe6cc873 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
354{ 354{
355 struct insn insn; 355 struct insn insn;
356 kprobe_opcode_t buf[MAX_INSN_SIZE]; 356 kprobe_opcode_t buf[MAX_INSN_SIZE];
357 int length;
357 unsigned long recovered_insn = 358 unsigned long recovered_insn =
358 recover_probed_instruction(buf, (unsigned long)src); 359 recover_probed_instruction(buf, (unsigned long)src);
359 360
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
361 return 0; 362 return 0;
362 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); 363 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
363 insn_get_length(&insn); 364 insn_get_length(&insn);
365 length = insn.length;
366
364 /* Another subsystem puts a breakpoint, failed to recover */ 367 /* Another subsystem puts a breakpoint, failed to recover */
365 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) 368 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
366 return 0; 369 return 0;
367 memcpy(dest, insn.kaddr, insn.length); 370 memcpy(dest, insn.kaddr, length);
368 371
369#ifdef CONFIG_X86_64 372#ifdef CONFIG_X86_64
370 if (insn_rip_relative(&insn)) { 373 if (insn_rip_relative(&insn)) {
371 s64 newdisp; 374 s64 newdisp;
372 u8 *disp; 375 u8 *disp;
373 kernel_insn_init(&insn, dest, insn.length); 376 kernel_insn_init(&insn, dest, length);
374 insn_get_displacement(&insn); 377 insn_get_displacement(&insn);
375 /* 378 /*
376 * The copied instruction uses the %rip-relative addressing 379 * The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
394 *(s32 *) disp = (s32) newdisp; 397 *(s32 *) disp = (s32) newdisp;
395 } 398 }
396#endif 399#endif
397 return insn.length; 400 return length;
398} 401}
399 402
400static int arch_copy_kprobe(struct kprobe *p) 403static int arch_copy_kprobe(struct kprobe *p)
@@ -602,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
602 struct kprobe *p; 605 struct kprobe *p;
603 struct kprobe_ctlblk *kcb; 606 struct kprobe_ctlblk *kcb;
604 607
605 if (user_mode_vm(regs)) 608 if (user_mode(regs))
606 return 0; 609 return 0;
607 610
608 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 611 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
1007 struct die_args *args = data; 1010 struct die_args *args = data;
1008 int ret = NOTIFY_DONE; 1011 int ret = NOTIFY_DONE;
1009 1012
1010 if (args->regs && user_mode_vm(args->regs)) 1013 if (args->regs && user_mode(args->regs))
1011 return ret; 1014 return ret;
1012 1015
1013 if (val == DIE_GPF) { 1016 if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
513 * can get false positives too easily, for example if the host is 513 * can get false positives too easily, for example if the host is
514 * overcommitted. 514 * overcommitted.
515 */ 515 */
516 watchdog_enable_hardlockup_detector(false); 516 hardlockup_detector_disable();
517} 517}
518 518
519static noinline uint32_t __kvm_cpuid_base(void) 519static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> 34#include <asm/page.h>
35#include <asm/pgtable.h> 35#include <asm/pgtable.h>
36#include <asm/setup.h>
36 37
37#if 0 38#if 0
38#define DEBUGP(fmt, ...) \ 39#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
47 48
48#ifdef CONFIG_RANDOMIZE_BASE 49#ifdef CONFIG_RANDOMIZE_BASE
49static unsigned long module_load_offset; 50static unsigned long module_load_offset;
50static int randomize_modules = 1;
51 51
52/* Mutex protects the module_load_offset. */ 52/* Mutex protects the module_load_offset. */
53static DEFINE_MUTEX(module_kaslr_mutex); 53static DEFINE_MUTEX(module_kaslr_mutex);
54 54
55static int __init parse_nokaslr(char *p)
56{
57 randomize_modules = 0;
58 return 0;
59}
60early_param("nokaslr", parse_nokaslr);
61
62static unsigned long int get_module_load_offset(void) 55static unsigned long int get_module_load_offset(void)
63{ 56{
64 if (randomize_modules) { 57 if (kaslr_enabled()) {
65 mutex_lock(&module_kaslr_mutex); 58 mutex_lock(&module_kaslr_mutex);
66 /* 59 /*
67 * Calculate the module_load_offset the first time this 60 * Calculate the module_load_offset the first time this
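
With the private randomize_modules flag gone, module randomization now follows the same kaslr_enabled() switch as the kernel image. A small userspace model of the lazily computed, boot-wide module load offset (the predicate and the offset range are stand-ins; the kernel also serializes the computation with module_kaslr_mutex):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in for the kernel-wide kaslr_enabled() predicate. */
static bool kaslr_enabled(void) { return true; }

static unsigned long module_load_offset;
static int calculated;

/* Pick one random, page-aligned offset the first time and reuse it forever. */
static unsigned long get_module_load_offset(void)
{
	if (!kaslr_enabled())
		return 0;
	if (!calculated) {
		srand((unsigned)time(NULL));
		module_load_offset = ((unsigned long)(rand() % 1024) + 1) * 4096;
		calculated = 1;
	}
	return module_load_offset;
}

int main(void)
{
	printf("module load offset: %#lx\n", get_module_load_offset());
	printf("module load offset: %#lx\n", get_module_load_offset()); /* same value */
	return 0;
}
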
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
443 .ptep_modify_prot_start = __ptep_modify_prot_start, 443 .ptep_modify_prot_start = __ptep_modify_prot_start,
444 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 444 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
445 445
446#if PAGETABLE_LEVELS >= 3 446#if CONFIG_PGTABLE_LEVELS >= 3
447#ifdef CONFIG_X86_PAE 447#ifdef CONFIG_X86_PAE
448 .set_pte_atomic = native_set_pte_atomic, 448 .set_pte_atomic = native_set_pte_atomic,
449 .pte_clear = native_pte_clear, 449 .pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
454 .pmd_val = PTE_IDENT, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT, 455 .make_pmd = PTE_IDENT,
456 456
457#if PAGETABLE_LEVELS == 4 457#if CONFIG_PGTABLE_LEVELS == 4
458 .pud_val = PTE_IDENT, 458 .pud_val = PTE_IDENT,
459 .make_pud = PTE_IDENT, 459 .make_pud = PTE_IDENT,
460 460
461 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
462#endif 462#endif
463#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
464 464
465 .pte_val = PTE_IDENT, 465 .pte_val = PTE_IDENT,
466 .pgd_val = PTE_IDENT, 466 .pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
131 } 131 }
132 132
133 /* 133 /*
134 * RIP, flags, and the argument registers are usually saved. 134 * These registers are always saved on 64-bit syscall entry.
135 * orig_ax is probably okay, too. 135 * On 32-bit entry points, they are saved too except r8..r11.
136 */ 136 */
137 regs_user_copy->ip = user_regs->ip; 137 regs_user_copy->ip = user_regs->ip;
138 regs_user_copy->ax = user_regs->ax;
138 regs_user_copy->cx = user_regs->cx; 139 regs_user_copy->cx = user_regs->cx;
139 regs_user_copy->dx = user_regs->dx; 140 regs_user_copy->dx = user_regs->dx;
140 regs_user_copy->si = user_regs->si; 141 regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
145 regs_user_copy->r11 = user_regs->r11; 146 regs_user_copy->r11 = user_regs->r11;
146 regs_user_copy->orig_ax = user_regs->orig_ax; 147 regs_user_copy->orig_ax = user_regs->orig_ax;
147 regs_user_copy->flags = user_regs->flags; 148 regs_user_copy->flags = user_regs->flags;
149 regs_user_copy->sp = user_regs->sp;
150 regs_user_copy->cs = user_regs->cs;
151 regs_user_copy->ss = user_regs->ss;
148 152
149 /* 153 /*
150 * Don't even try to report the "rest" regs. 154 * Most system calls don't save these registers, don't report them.
151 */ 155 */
152 regs_user_copy->bx = -1; 156 regs_user_copy->bx = -1;
153 regs_user_copy->bp = -1; 157 regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
158 162
159 /* 163 /*
160 * For this to be at all useful, we need a reasonable guess for 164 * For this to be at all useful, we need a reasonable guess for
161 * sp and the ABI. Be careful: we're in NMI context, and we're 165 * the ABI. Be careful: we're in NMI context, and we're
162 * considering current to be the current task, so we should 166 * considering current to be the current task, so we should
163 * be careful not to look at any other percpu variables that might 167 * be careful not to look at any other percpu variables that might
164 * change during context switches. 168 * change during context switches.
165 */ 169 */
166 if (IS_ENABLED(CONFIG_IA32_EMULATION) && 170 regs_user->abi = user_64bit_mode(user_regs) ?
167 task_thread_info(current)->status & TS_COMPAT) { 171 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
168 /* Easy case: we're in a compat syscall. */
169 regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
170 regs_user_copy->sp = user_regs->sp;
171 regs_user_copy->cs = user_regs->cs;
172 regs_user_copy->ss = user_regs->ss;
173 } else if (user_regs->orig_ax != -1) {
174 /*
175 * We're probably in a 64-bit syscall.
176 * Warning: this code is severely racy. At least it's better
177 * than just blindly copying user_regs.
178 */
179 regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
180 regs_user_copy->sp = this_cpu_read(old_rsp);
181 regs_user_copy->cs = __USER_CS;
182 regs_user_copy->ss = __USER_DS;
183 regs_user_copy->cx = -1; /* usually contains garbage */
184 } else {
185 /* We're probably in an interrupt or exception. */
186 regs_user->abi = user_64bit_mode(user_regs) ?
187 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
188 regs_user_copy->sp = user_regs->sp;
189 regs_user_copy->cs = user_regs->cs;
190 regs_user_copy->ss = user_regs->ss;
191 }
192 172
193 regs_user->regs = regs_user_copy; 173 regs_user->regs = regs_user_copy;
194} 174}
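
The perf_get_regs_user() rewrite stops guessing the ABI from syscall state and instead derives it from the saved user CS via user_64bit_mode(), while registers the entry code does not save are reported as -1. A compact model of that ABI decision (the selector values are the conventional x86-64 __USER_CS/__USER32_CS, treated here as assumptions):

#include <stdbool.h>
#include <stdio.h>

enum perf_sample_regs_abi {
	PERF_SAMPLE_REGS_ABI_32 = 1,
	PERF_SAMPLE_REGS_ABI_64 = 2,
};

/* Stand-in: the kernel inspects the saved CS to see if user space runs in
 * 64-bit (long) mode. */
static bool user_64bit_mode_stub(unsigned long cs)
{
	const unsigned long user_cs_64 = 0x33;	/* assumed __USER_CS value */

	return cs == user_cs_64;
}

static int pick_abi(unsigned long cs)
{
	return user_64bit_mode_stub(cs) ? PERF_SAMPLE_REGS_ABI_64
					: PERF_SAMPLE_REGS_ABI_32;
}

int main(void)
{
	printf("cs=0x33 -> abi %d\n", pick_abi(0x33));
	printf("cs=0x23 -> abi %d\n", pick_abi(0x23));	/* assumed __USER32_CS */
	return 0;
}
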
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..3420c874ddc5
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
1/*
2 * Copyright (c) 2015, Christoph Hellwig.
3 */
4#include <linux/memblock.h>
5#include <linux/platform_device.h>
6#include <linux/slab.h>
7#include <asm/e820.h>
8#include <asm/page_types.h>
9#include <asm/setup.h>
10
11static __init void register_pmem_device(struct resource *res)
12{
13 struct platform_device *pdev;
14 int error;
15
16 pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
17 if (!pdev)
18 return;
19
20 error = platform_device_add_resources(pdev, res, 1);
21 if (error)
22 goto out_put_pdev;
23
24 error = platform_device_add(pdev);
25 if (error)
26 goto out_put_pdev;
27 return;
28
29out_put_pdev:
30 dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
31 platform_device_put(pdev);
32}
33
34static __init int register_pmem_devices(void)
35{
36 int i;
37
38 for (i = 0; i < e820.nr_map; i++) {
39 struct e820entry *ei = &e820.map[i];
40
41 if (ei->type == E820_PRAM) {
42 struct resource res = {
43 .flags = IORESOURCE_MEM,
44 .start = ei->addr,
45 .end = ei->addr + ei->size - 1,
46 };
47 register_pmem_device(&res);
48 }
49 }
50
51 return 0;
52}
53device_initcall(register_pmem_devices);
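
The new file only registers platform devices named "pmem", one per E820_PRAM range; an actual driver elsewhere is expected to bind to them. A hedged sketch of what such a consumer could look like (the driver name matches the device registered above; everything else is illustrative):

#include <linux/module.h>
#include <linux/platform_device.h>

static int pmem_probe(struct platform_device *pdev)
{
	struct resource *res;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)
		return -ENXIO;

	dev_info(&pdev->dev, "persistent memory range %pR\n", res);
	/* A real driver would map the range and expose it as a block device. */
	return 0;
}

static int pmem_remove(struct platform_device *pdev)
{
	return 0;
}

static struct platform_driver pmem_driver = {
	.probe  = pmem_probe,
	.remove = pmem_remove,
	.driver = {
		.name = "pmem",	/* matches the platform device name above */
	},
};
module_platform_driver(pmem_driver);

MODULE_LICENSE("GPL");

Because the devices are allocated with PLATFORM_DEVID_AUTO, each E820_PRAM range gets its own instance, so a driver like this would be probed once per range.
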
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..8213da62b1b7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pm.h> 11#include <linux/pm.h>
12#include <linux/clockchips.h> 12#include <linux/tick.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/user-return-notifier.h> 14#include <linux/user-return-notifier.h>
15#include <linux/dmi.h> 15#include <linux/dmi.h>
@@ -24,6 +24,7 @@
24#include <asm/syscalls.h> 24#include <asm/syscalls.h>
25#include <asm/idle.h> 25#include <asm/idle.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27#include <asm/mwait.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/fpu-internal.h> 29#include <asm/fpu-internal.h>
29#include <asm/debugreg.h> 30#include <asm/debugreg.h>
@@ -37,7 +38,26 @@
37 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
38 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
39 */ 40 */
40__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; 41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
42 .x86_tss = {
43 .sp0 = TOP_OF_INIT_STACK,
44#ifdef CONFIG_X86_32
45 .ss0 = __KERNEL_DS,
46 .ss1 = __KERNEL_CS,
47 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
48#endif
49 },
50#ifdef CONFIG_X86_32
51 /*
52 * Note that the .io_bitmap member must be extra-big. This is because
53 * the CPU will access an additional byte beyond the end of the IO
54 * permission bitmap. The extra byte must be all 1 bits, and must
55 * be within the limit.
56 */
57 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
58#endif
59};
60EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
41 61
42#ifdef CONFIG_X86_64 62#ifdef CONFIG_X86_64
43static DEFINE_PER_CPU(unsigned char, is_idle); 63static DEFINE_PER_CPU(unsigned char, is_idle);
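
The cpu_tss initializer's comment explains why the I/O bitmap is one long larger than strictly needed: the CPU reads one byte past the bitmap limit, and that byte must be all ones. A standalone illustration of the sizing rule (the constants are the conventional x86 values, stated as assumptions; the range initializer is a GCC/Clang extension, just as in the diff):

#include <stdio.h>

#define IO_BITMAP_BITS  65536
#define IO_BITMAP_LONGS (IO_BITMAP_BITS / (8 * (int)sizeof(unsigned long)))

/* One extra long beyond the nominal bitmap, with every bit set ("deny"),
 * so the byte the CPU reads past the end is guaranteed to be all ones. */
static unsigned long io_bitmap[IO_BITMAP_LONGS + 1] = {
	[0 ... IO_BITMAP_LONGS] = ~0UL,
};

int main(void)
{
	printf("bitmap longs: %d (+1 guard), last word: %#lx\n",
	       IO_BITMAP_LONGS, io_bitmap[IO_BITMAP_LONGS]);
	return 0;
}
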
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
69 89
70 dst->thread.fpu_counter = 0; 90 dst->thread.fpu_counter = 0;
71 dst->thread.fpu.has_fpu = 0; 91 dst->thread.fpu.has_fpu = 0;
72 dst->thread.fpu.last_cpu = ~0;
73 dst->thread.fpu.state = NULL; 92 dst->thread.fpu.state = NULL;
93 task_disable_lazy_fpu_restore(dst);
74 if (tsk_used_math(src)) { 94 if (tsk_used_math(src)) {
75 int err = fpu_alloc(&dst->thread.fpu); 95 int err = fpu_alloc(&dst->thread.fpu);
76 if (err) 96 if (err)
@@ -109,7 +129,7 @@ void exit_thread(void)
109 unsigned long *bp = t->io_bitmap_ptr; 129 unsigned long *bp = t->io_bitmap_ptr;
110 130
111 if (bp) { 131 if (bp) {
112 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 132 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
113 133
114 t->io_bitmap_ptr = NULL; 134 t->io_bitmap_ptr = NULL;
115 clear_thread_flag(TIF_IO_BITMAP); 135 clear_thread_flag(TIF_IO_BITMAP);
@@ -131,13 +151,18 @@ void flush_thread(void)
131 151
132 flush_ptrace_hw_breakpoint(tsk); 152 flush_ptrace_hw_breakpoint(tsk);
133 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 153 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
134 drop_init_fpu(tsk); 154
135 /* 155 if (!use_eager_fpu()) {
136 * Free the FPU state for non xsave platforms. They get reallocated 156 /* FPU state will be reallocated lazily at the first use. */
137 * lazily at the first use. 157 drop_fpu(tsk);
138 */
139 if (!use_eager_fpu())
140 free_thread_xstate(tsk); 158 free_thread_xstate(tsk);
159 } else if (!used_math()) {
160 /* kthread execs. TODO: cleanup this horror. */
161 if (WARN_ON(init_fpu(tsk)))
162 force_sig(SIGKILL, tsk);
163 user_fpu_begin();
164 restore_init_xstate();
165 }
141} 166}
142 167
143static void hard_disable_TSC(void) 168static void hard_disable_TSC(void)
@@ -377,14 +402,11 @@ static void amd_e400_idle(void)
377 402
378 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { 403 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
379 cpumask_set_cpu(cpu, amd_e400_c1e_mask); 404 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
380 /* 405 /* Force broadcast so ACPI can not interfere. */
381 * Force broadcast so ACPI can not interfere. 406 tick_broadcast_force();
382 */
383 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
384 &cpu);
385 pr_info("Switch to broadcast mode on CPU%d\n", cpu); 407 pr_info("Switch to broadcast mode on CPU%d\n", cpu);
386 } 408 }
387 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 409 tick_broadcast_enter();
388 410
389 default_idle(); 411 default_idle();
390 412
@@ -393,12 +415,59 @@ static void amd_e400_idle(void)
393 * called with interrupts disabled. 415 * called with interrupts disabled.
394 */ 416 */
395 local_irq_disable(); 417 local_irq_disable();
396 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 418 tick_broadcast_exit();
397 local_irq_enable(); 419 local_irq_enable();
398 } else 420 } else
399 default_idle(); 421 default_idle();
400} 422}
401 423
424/*
425 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
426 * We can't rely on cpuidle installing MWAIT, because it will not load
427 * on systems that support only C1 -- so the boot default must be MWAIT.
428 *
429 * Some AMD machines are the opposite, they depend on using HALT.
430 *
431 * So for default C1, which is used during boot until cpuidle loads,
432 * use MWAIT-C1 on Intel HW that has it, else use HALT.
433 */
434static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
435{
436 if (c->x86_vendor != X86_VENDOR_INTEL)
437 return 0;
438
439 if (!cpu_has(c, X86_FEATURE_MWAIT))
440 return 0;
441
442 return 1;
443}
444
445/*
 446 * MONITOR/MWAIT with no hints, used for default C1 state.
 447 * This invokes MWAIT with interrupts enabled and no flags,
448 * which is backwards compatible with the original MWAIT implementation.
449 */
450
451static void mwait_idle(void)
452{
453 if (!current_set_polling_and_test()) {
454 if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
455 smp_mb(); /* quirk */
456 clflush((void *)&current_thread_info()->flags);
457 smp_mb(); /* quirk */
458 }
459
460 __monitor((void *)&current_thread_info()->flags, 0, 0);
461 if (!need_resched())
462 __sti_mwait(0, 0);
463 else
464 local_irq_enable();
465 } else {
466 local_irq_enable();
467 }
468 __current_clr_polling();
469}
470
402void select_idle_routine(const struct cpuinfo_x86 *c) 471void select_idle_routine(const struct cpuinfo_x86 *c)
403{ 472{
404#ifdef CONFIG_SMP 473#ifdef CONFIG_SMP
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
412 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 481 /* E400: APIC timer interrupt does not wake up CPU from C1e */
413 pr_info("using AMD E400 aware idle routine\n"); 482 pr_info("using AMD E400 aware idle routine\n");
414 x86_idle = amd_e400_idle; 483 x86_idle = amd_e400_idle;
484 } else if (prefer_mwait_c1_over_halt(c)) {
485 pr_info("using mwait in idle threads\n");
486 x86_idle = mwait_idle;
415 } else 487 } else
416 x86_idle = default_idle; 488 x86_idle = default_idle;
417} 489}
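
Pulling the process.c pieces together: the boot-time C1 idle routine is now chosen per vendor and feature, and the MWAIT variant monitors the thread-flags word so that setting need_resched from another CPU wakes the idler without an IPI. A compact userspace model of just the selection logic (vendor and feature values are stand-ins for the cpuinfo fields the kernel consults):

#include <stdbool.h>
#include <stdio.h>

enum x86_vendor { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

struct cpuinfo {
	enum x86_vendor vendor;
	bool has_mwait;
	bool has_amd_e400_erratum;
};

enum idle_routine { IDLE_DEFAULT_HLT, IDLE_AMD_E400, IDLE_MWAIT_C1 };

/* Mirrors select_idle_routine(): E400 workaround first, then MWAIT-C1 on
 * Intel parts that advertise MONITOR/MWAIT, otherwise plain HLT. */
static enum idle_routine select_idle(const struct cpuinfo *c)
{
	if (c->has_amd_e400_erratum)
		return IDLE_AMD_E400;
	if (c->vendor == VENDOR_INTEL && c->has_mwait)
		return IDLE_MWAIT_C1;
	return IDLE_DEFAULT_HLT;
}

int main(void)
{
	struct cpuinfo core2 = { VENDOR_INTEL, true, false };
	struct cpuinfo e400  = { VENDOR_AMD, true, true };

	printf("core2 -> %d, e400 -> %d\n", select_idle(&core2), select_idle(&e400));
	return 0;
}
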
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
73 unsigned long sp; 73 unsigned long sp;
74 unsigned short ss, gs; 74 unsigned short ss, gs;
75 75
76 if (user_mode_vm(regs)) { 76 if (user_mode(regs)) {
77 sp = regs->sp; 77 sp = regs->sp;
78 ss = regs->ss & 0xffff; 78 ss = regs->ss & 0xffff;
79 gs = get_user_gs(regs); 79 gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
206 regs->ip = new_ip; 206 regs->ip = new_ip;
207 regs->sp = new_sp; 207 regs->sp = new_sp;
208 regs->flags = X86_EFLAGS_IF; 208 regs->flags = X86_EFLAGS_IF;
209 /* 209 force_iret();
210 * force it to the iret return path by making it look as if there was
211 * some work pending.
212 */
213 set_thread_flag(TIF_NOTIFY_RESUME);
214} 210}
215EXPORT_SYMBOL_GPL(start_thread); 211EXPORT_SYMBOL_GPL(start_thread);
216 212
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
248 struct thread_struct *prev = &prev_p->thread, 244 struct thread_struct *prev = &prev_p->thread,
249 *next = &next_p->thread; 245 *next = &next_p->thread;
250 int cpu = smp_processor_id(); 246 int cpu = smp_processor_id();
251 struct tss_struct *tss = &per_cpu(init_tss, cpu); 247 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
252 fpu_switch_t fpu; 248 fpu_switch_t fpu;
253 249
254 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 250 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
256 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 252 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
257 253
258 /* 254 /*
259 * Reload esp0.
260 */
261 load_sp0(tss, next);
262
263 /*
264 * Save away %gs. No need to save %fs, as it was saved on the 255 * Save away %gs. No need to save %fs, as it was saved on the
265 * stack on entry. No need to save %es and %ds, as those are 256 * stack on entry. No need to save %es and %ds, as those are
266 * always kernel segments while inside the kernel. Doing this 257 * always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
310 */ 301 */
311 arch_end_context_switch(next_p); 302 arch_end_context_switch(next_p);
312 303
304 /*
305 * Reload esp0, kernel_stack, and current_top_of_stack. This changes
306 * current_thread_info().
307 */
308 load_sp0(tss, next);
313 this_cpu_write(kernel_stack, 309 this_cpu_write(kernel_stack,
314 (unsigned long)task_stack_page(next_p) + 310 (unsigned long)task_stack_page(next_p) +
315 THREAD_SIZE - KERNEL_STACK_OFFSET); 311 THREAD_SIZE);
312 this_cpu_write(cpu_current_top_of_stack,
313 (unsigned long)task_stack_page(next_p) +
314 THREAD_SIZE);
316 315
317 /* 316 /*
318 * Restore %gs if needed (which is common) 317 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52 52
53asmlinkage extern void ret_from_fork(void); 53asmlinkage extern void ret_from_fork(void);
54 54
55__visible DEFINE_PER_CPU(unsigned long, old_rsp); 55__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
56 56
57/* Prints also some state that isn't saved in the pt_regs */ 57/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 58void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; 161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
162 childregs = task_pt_regs(p); 162 childregs = task_pt_regs(p);
163 p->thread.sp = (unsigned long) childregs; 163 p->thread.sp = (unsigned long) childregs;
164 p->thread.usersp = me->thread.usersp;
165 set_tsk_thread_flag(p, TIF_FORK); 164 set_tsk_thread_flag(p, TIF_FORK);
166 p->thread.io_bitmap_ptr = NULL; 165 p->thread.io_bitmap_ptr = NULL;
167 166
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
207 */ 206 */
208 if (clone_flags & CLONE_SETTLS) { 207 if (clone_flags & CLONE_SETTLS) {
209#ifdef CONFIG_IA32_EMULATION 208#ifdef CONFIG_IA32_EMULATION
210 if (test_thread_flag(TIF_IA32)) 209 if (is_ia32_task())
211 err = do_set_thread_area(p, -1, 210 err = do_set_thread_area(p, -1,
212 (struct user_desc __user *)childregs->si, 0); 211 (struct user_desc __user *)childregs->si, 0);
213 else 212 else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
235 loadsegment(es, _ds); 234 loadsegment(es, _ds);
236 loadsegment(ds, _ds); 235 loadsegment(ds, _ds);
237 load_gs_index(0); 236 load_gs_index(0);
238 current->thread.usersp = new_sp;
239 regs->ip = new_ip; 237 regs->ip = new_ip;
240 regs->sp = new_sp; 238 regs->sp = new_sp;
241 this_cpu_write(old_rsp, new_sp);
242 regs->cs = _cs; 239 regs->cs = _cs;
243 regs->ss = _ss; 240 regs->ss = _ss;
244 regs->flags = X86_EFLAGS_IF; 241 regs->flags = X86_EFLAGS_IF;
242 force_iret();
245} 243}
246 244
247void 245void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
277 struct thread_struct *prev = &prev_p->thread; 275 struct thread_struct *prev = &prev_p->thread;
278 struct thread_struct *next = &next_p->thread; 276 struct thread_struct *next = &next_p->thread;
279 int cpu = smp_processor_id(); 277 int cpu = smp_processor_id();
280 struct tss_struct *tss = &per_cpu(init_tss, cpu); 278 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
281 unsigned fsindex, gsindex; 279 unsigned fsindex, gsindex;
282 fpu_switch_t fpu; 280 fpu_switch_t fpu;
283 281
284 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 282 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
285 283
286 /* Reload esp0 and ss1. */
287 load_sp0(tss, next);
288
289 /* We must save %fs and %gs before load_TLS() because 284 /* We must save %fs and %gs before load_TLS() because
290 * %fs and %gs may be cleared by load_TLS(). 285 * %fs and %gs may be cleared by load_TLS().
291 * 286 *
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
401 /* 396 /*
402 * Switch the PDA and FPU contexts. 397 * Switch the PDA and FPU contexts.
403 */ 398 */
404 prev->usersp = this_cpu_read(old_rsp);
405 this_cpu_write(old_rsp, next->usersp);
406 this_cpu_write(current_task, next_p); 399 this_cpu_write(current_task, next_p);
407 400
408 /* 401 /*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
413 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); 406 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
414 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); 407 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
415 408
409 /* Reload esp0 and ss1. This changes current_thread_info(). */
410 load_sp0(tss, next);
411
416 this_cpu_write(kernel_stack, 412 this_cpu_write(kernel_stack,
417 (unsigned long)task_stack_page(next_p) + 413 (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
418 THREAD_SIZE - KERNEL_STACK_OFFSET);
419 414
420 /* 415 /*
421 * Now maybe reload the debug registers and handle I/O bitmaps 416 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
602 597
603unsigned long KSTK_ESP(struct task_struct *task) 598unsigned long KSTK_ESP(struct task_struct *task)
604{ 599{
605 return (test_tsk_thread_flag(task, TIF_IA32)) ? 600 return task_pt_regs(task)->sp;
606 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
607} 601}
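
With thread.usersp and the old_rsp per-CPU variable gone, the user stack pointer is always read back from the pt_regs frame saved at the top of the task's kernel stack, which is exactly what the new KSTK_ESP() does. A standalone sketch of that layout arithmetic (THREAD_SIZE and the one-field register struct are simplifications, not the kernel's definitions):

#include <stdio.h>

#define THREAD_SIZE (16 * 1024)	/* assumed 64-bit kernel stack size */

struct pt_regs_model { unsigned long sp; /* ...other saved registers... */ };

/* task_pt_regs(): the saved user register frame sits at the very top of the
 * kernel stack, so it is located by pure pointer arithmetic. */
static struct pt_regs_model *task_pt_regs_model(void *stack_page)
{
	return (struct pt_regs_model *)((char *)stack_page + THREAD_SIZE) - 1;
}

int main(void)
{
	static char stack[THREAD_SIZE];
	struct pt_regs_model *regs = task_pt_regs_model(stack);

	regs->sp = 0x7ffdf000;	/* pretend entry code saved the user stack pointer */
	printf("KSTK_ESP-style read: %#lx\n", task_pt_regs_model(stack)->sp);
	return 0;
}
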
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
364 case offsetof(struct user_regs_struct,cs): 364 case offsetof(struct user_regs_struct,cs):
365 if (unlikely(value == 0)) 365 if (unlikely(value == 0))
366 return -EIO; 366 return -EIO;
367#ifdef CONFIG_IA32_EMULATION 367 task_pt_regs(task)->cs = value;
368 if (test_tsk_thread_flag(task, TIF_IA32))
369 task_pt_regs(task)->cs = value;
370#endif
371 break; 368 break;
372 case offsetof(struct user_regs_struct,ss): 369 case offsetof(struct user_regs_struct,ss):
373 if (unlikely(value == 0)) 370 if (unlikely(value == 0))
374 return -EIO; 371 return -EIO;
375#ifdef CONFIG_IA32_EMULATION 372 task_pt_regs(task)->ss = value;
376 if (test_tsk_thread_flag(task, TIF_IA32))
377 task_pt_regs(task)->ss = value;
378#endif
379 break; 373 break;
380 } 374 }
381 375
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
1421 memset(info, 0, sizeof(*info)); 1415 memset(info, 0, sizeof(*info));
1422 info->si_signo = SIGTRAP; 1416 info->si_signo = SIGTRAP;
1423 info->si_code = si_code; 1417 info->si_code = si_code;
1424 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; 1418 info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
1425} 1419}
1426 1420
1427void user_single_step_siginfo(struct task_struct *tsk, 1421void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..e5ecd20e72dd 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
142} 142}
143 143
144static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
145
146static struct pvclock_vsyscall_time_info *
147pvclock_get_vsyscall_user_time_info(int cpu)
148{
149 if (!pvclock_vdso_info) {
150 BUG();
151 return NULL;
152 }
153
154 return &pvclock_vdso_info[cpu];
155}
156
157struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
158{
159 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
160}
161
144#ifdef CONFIG_X86_64 162#ifdef CONFIG_X86_64
163static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
164 void *v)
165{
166 struct task_migration_notifier *mn = v;
167 struct pvclock_vsyscall_time_info *pvti;
168
169 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
170
171 /* this is NULL when pvclock vsyscall is not initialized */
172 if (unlikely(pvti == NULL))
173 return NOTIFY_DONE;
174
175 pvti->migrate_count++;
176
177 return NOTIFY_DONE;
178}
179
180static struct notifier_block pvclock_migrate = {
181 .notifier_call = pvclock_task_migrate,
182};
183
145/* 184/*
146 * Initialize the generic pvclock vsyscall state. This will allocate 185 * Initialize the generic pvclock vsyscall state. This will allocate
147 * a/some page(s) for the per-vcpu pvclock information, set up a 186 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
155 194
156 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 195 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
157 196
197 pvclock_vdso_info = i;
198
158 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 199 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
159 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 200 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
160 __pa(i) + (idx*PAGE_SIZE), 201 __pa(i) + (idx*PAGE_SIZE),
161 PAGE_KERNEL_VVAR); 202 PAGE_KERNEL_VVAR);
162 } 203 }
163 204
205
206 register_task_migration_notifier(&pvclock_migrate);
207
164 return 0; 208 return 0;
165} 209}
166#endif 210#endif
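
The migration notifier above only bumps a per-vCPU migrate_count; the point is that a vdso-side reader can snapshot the counter, read the time info, and retry if the count changed, meaning the task was migrated mid-read. A hedged model of that reader loop (the field layout and barriers are simplified and do not reproduce the vdso's exact code):

#include <stdint.h>
#include <stdio.h>

struct pvti_model {
	volatile uint32_t migrate_count;
	volatile uint32_t version;
	volatile uint64_t system_time;	/* simplified stand-in for the pvti fields */
};

static uint64_t read_pvclock(const struct pvti_model *pvti)
{
	uint32_t migrate, version;
	uint64_t t;

	do {
		migrate = pvti->migrate_count;
		version = pvti->version;
		/* the real code inserts read barriers around the data read */
		t = pvti->system_time;
	} while (version != pvti->version || migrate != pvti->migrate_count);

	return t;
}

int main(void)
{
	struct pvti_model pvti = { .migrate_count = 0, .version = 2, .system_time = 12345 };

	printf("pvclock read: %llu\n", (unsigned long long)read_pvclock(&pvti));
	return 0;
}
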
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
226 movl (%ebx), %ecx 226 movl (%ebx), %ecx
227 addl $4, %ebx 227 addl $4, %ebx
2281: 2281:
229 testl $0x1, %ecx /* is it a destination page */ 229 testb $0x1, %cl /* is it a destination page */
230 jz 2f 230 jz 2f
231 movl %ecx, %edi 231 movl %ecx, %edi
232 andl $0xfffff000, %edi 232 andl $0xfffff000, %edi
233 jmp 0b 233 jmp 0b
2342: 2342:
235 testl $0x2, %ecx /* is it an indirection page */ 235 testb $0x2, %cl /* is it an indirection page */
236 jz 2f 236 jz 2f
237 movl %ecx, %ebx 237 movl %ecx, %ebx
238 andl $0xfffff000, %ebx 238 andl $0xfffff000, %ebx
239 jmp 0b 239 jmp 0b
2402: 2402:
241 testl $0x4, %ecx /* is it the done indicator */ 241 testb $0x4, %cl /* is it the done indicator */
242 jz 2f 242 jz 2f
243 jmp 3f 243 jmp 3f
2442: 2442:
245 testl $0x8, %ecx /* is it the source indicator */ 245 testb $0x8, %cl /* is it the source indicator */
246 jz 0b /* Ignore it otherwise */ 246 jz 0b /* Ignore it otherwise */
247 movl %ecx, %esi /* For every source page do a copy */ 247 movl %ecx, %esi /* For every source page do a copy */
248 andl $0xfffff000, %esi 248 andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
123 * Set cr4 to a known state: 123 * Set cr4 to a known state:
124 * - physical address extension enabled 124 * - physical address extension enabled
125 */ 125 */
126 movq $X86_CR4_PAE, %rax 126 movl $X86_CR4_PAE, %eax
127 movq %rax, %cr4 127 movq %rax, %cr4
128 128
129 jmp 1f 129 jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
221 movq (%rbx), %rcx 221 movq (%rbx), %rcx
222 addq $8, %rbx 222 addq $8, %rbx
2231: 2231:
224 testq $0x1, %rcx /* is it a destination page? */ 224 testb $0x1, %cl /* is it a destination page? */
225 jz 2f 225 jz 2f
226 movq %rcx, %rdi 226 movq %rcx, %rdi
227 andq $0xfffffffffffff000, %rdi 227 andq $0xfffffffffffff000, %rdi
228 jmp 0b 228 jmp 0b
2292: 2292:
230 testq $0x2, %rcx /* is it an indirection page? */ 230 testb $0x2, %cl /* is it an indirection page? */
231 jz 2f 231 jz 2f
232 movq %rcx, %rbx 232 movq %rcx, %rbx
233 andq $0xfffffffffffff000, %rbx 233 andq $0xfffffffffffff000, %rbx
234 jmp 0b 234 jmp 0b
2352: 2352:
236 testq $0x4, %rcx /* is it the done indicator? */ 236 testb $0x4, %cl /* is it the done indicator? */
237 jz 2f 237 jz 2f
238 jmp 3f 238 jmp 3f
2392: 2392:
240 testq $0x8, %rcx /* is it the source indicator? */ 240 testb $0x8, %cl /* is it the source indicator? */
241 jz 0b /* Ignore it otherwise */ 241 jz 0b /* Ignore it otherwise */
242 movq %rcx, %rsi /* For ever source page do a copy */ 242 movq %rcx, %rsi /* For ever source page do a copy */
243 andq $0xfffffffffffff000, %rsi 243 andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
246 movq %rsi, %rax 246 movq %rsi, %rax
247 247
248 movq %r10, %rdi 248 movq %r10, %rdi
249 movq $512, %rcx 249 movl $512, %ecx
250 rep ; movsq 250 rep ; movsq
251 251
252 movq %rax, %rdi 252 movq %rax, %rdi
253 movq %rdx, %rsi 253 movq %rdx, %rsi
254 movq $512, %rcx 254 movl $512, %ecx
255 rep ; movsq 255 rep ; movsq
256 256
257 movq %rdx, %rdi 257 movq %rdx, %rdi
258 movq %r10, %rsi 258 movq %r10, %rsi
259 movq $512, %rcx 259 movl $512, %ecx
260 rep ; movsq 260 rep ; movsq
261 261
262 lea PAGE_SIZE(%rax), %rsi 262 lea PAGE_SIZE(%rax), %rsi
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..d74ac33290ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
354 mapaddr = ramdisk_image & PAGE_MASK; 354 mapaddr = ramdisk_image & PAGE_MASK;
355 p = early_memremap(mapaddr, clen+slop); 355 p = early_memremap(mapaddr, clen+slop);
356 memcpy(q, p+slop, clen); 356 memcpy(q, p+slop, clen);
357 early_iounmap(p, clen+slop); 357 early_memunmap(p, clen+slop);
358 q += clen; 358 q += clen;
359 ramdisk_image += clen; 359 ramdisk_image += clen;
360 ramdisk_size -= clen; 360 ramdisk_size -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
438 data_len = data->len + sizeof(struct setup_data); 438 data_len = data->len + sizeof(struct setup_data);
439 data_type = data->type; 439 data_type = data->type;
440 pa_next = data->next; 440 pa_next = data->next;
441 early_iounmap(data, sizeof(*data)); 441 early_memunmap(data, sizeof(*data));
442 442
443 switch (data_type) { 443 switch (data_type) {
444 case SETUP_E820_EXT: 444 case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
470 E820_RAM, E820_RESERVED_KERN); 470 E820_RAM, E820_RESERVED_KERN);
471 found = 1; 471 found = 1;
472 pa_data = data->next; 472 pa_data = data->next;
473 early_iounmap(data, sizeof(*data)); 473 early_memunmap(data, sizeof(*data));
474 } 474 }
475 if (!found) 475 if (!found)
476 return; 476 return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
491 data = early_memremap(pa_data, sizeof(*data)); 491 data = early_memremap(pa_data, sizeof(*data));
492 memblock_reserve(pa_data, sizeof(*data) + data->len); 492 memblock_reserve(pa_data, sizeof(*data) + data->len);
493 pa_data = data->next; 493 pa_data = data->next;
494 early_iounmap(data, sizeof(*data)); 494 early_memunmap(data, sizeof(*data));
495 } 495 }
496} 496}
497 497
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
832static int 832static int
833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) 833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
834{ 834{
835 pr_emerg("Kernel Offset: 0x%lx from 0x%lx " 835 if (kaslr_enabled()) {
836 "(relocation range: 0x%lx-0x%lx)\n", 836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
837 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, 837 (unsigned long)&_text - __START_KERNEL,
838 __START_KERNEL_map, MODULES_VADDR-1); 838 __START_KERNEL,
839 __START_KERNEL_map,
840 MODULES_VADDR-1);
841 } else {
842 pr_emerg("Kernel Offset: disabled\n");
843 }
839 844
840 return 0; 845 return 0;
841} 846}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..1ea14fd53933 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
61 regs->seg = GET_SEG(seg) | 3; \ 61 regs->seg = GET_SEG(seg) | 3; \
62} while (0) 62} while (0)
63 63
64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
65 unsigned long *pax)
66{ 65{
67 void __user *buf; 66 void __user *buf;
68 unsigned int tmpflags; 67 unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
81#endif /* CONFIG_X86_32 */ 80#endif /* CONFIG_X86_32 */
82 81
83 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 82 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
84 COPY(dx); COPY(cx); COPY(ip); 83 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
85 84
86#ifdef CONFIG_X86_64 85#ifdef CONFIG_X86_64
87 COPY(r8); 86 COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
94 COPY(r15); 93 COPY(r15);
95#endif /* CONFIG_X86_64 */ 94#endif /* CONFIG_X86_64 */
96 95
97#ifdef CONFIG_X86_32
98 COPY_SEG_CPL3(cs); 96 COPY_SEG_CPL3(cs);
99 COPY_SEG_CPL3(ss); 97 COPY_SEG_CPL3(ss);
100#else /* !CONFIG_X86_32 */
101 /* Kernel saves and restores only the CS segment register on signals,
102 * which is the bare minimum needed to allow mixed 32/64-bit code.
103 * App's signal handler can save/restore other segments if needed. */
104 COPY_SEG_CPL3(cs);
105#endif /* CONFIG_X86_32 */
106 98
107 get_user_ex(tmpflags, &sc->flags); 99 get_user_ex(tmpflags, &sc->flags);
108 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 100 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
109 regs->orig_ax = -1; /* disable syscall checks */ 101 regs->orig_ax = -1; /* disable syscall checks */
110 102
111 get_user_ex(buf, &sc->fpstate); 103 get_user_ex(buf, &sc->fpstate);
112
113 get_user_ex(*pax, &sc->ax);
114 } get_user_catch(err); 104 } get_user_catch(err);
115 105
116 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); 106 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
117 107
108 force_iret();
109
118 return err; 110 return err;
119} 111}
120 112
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
162#else /* !CONFIG_X86_32 */ 154#else /* !CONFIG_X86_32 */
163 put_user_ex(regs->flags, &sc->flags); 155 put_user_ex(regs->flags, &sc->flags);
164 put_user_ex(regs->cs, &sc->cs); 156 put_user_ex(regs->cs, &sc->cs);
165 put_user_ex(0, &sc->gs); 157 put_user_ex(0, &sc->__pad2);
166 put_user_ex(0, &sc->fs); 158 put_user_ex(0, &sc->__pad1);
159 put_user_ex(regs->ss, &sc->ss);
167#endif /* CONFIG_X86_32 */ 160#endif /* CONFIG_X86_32 */
168 161
169 put_user_ex(fpstate, &sc->fpstate); 162 put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
457 450
458 regs->sp = (unsigned long)frame; 451 regs->sp = (unsigned long)frame;
459 452
460 /* Set up the CS register to run signal handlers in 64-bit mode, 453 /*
461 even if the handler happens to be interrupting 32-bit code. */ 454 * Set up the CS and SS registers to run signal handlers in
455 * 64-bit mode, even if the handler happens to be interrupting
456 * 32-bit or 16-bit code.
457 *
458 * SS is subtle. In 64-bit mode, we don't need any particular
459 * SS descriptor, but we do need SS to be valid. It's possible
460 * that the old SS is entirely bogus -- this can happen if the
461 * signal we're trying to deliver is #GP or #SS caused by a bad
462 * SS value.
463 */
462 regs->cs = __USER_CS; 464 regs->cs = __USER_CS;
465 regs->ss = __USER_DS;
463 466
464 return 0; 467 return 0;
465} 468}
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
539{ 542{
540 struct pt_regs *regs = current_pt_regs(); 543 struct pt_regs *regs = current_pt_regs();
541 struct sigframe __user *frame; 544 struct sigframe __user *frame;
542 unsigned long ax;
543 sigset_t set; 545 sigset_t set;
544 546
545 frame = (struct sigframe __user *)(regs->sp - 8); 547 frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
553 555
554 set_current_blocked(&set); 556 set_current_blocked(&set);
555 557
556 if (restore_sigcontext(regs, &frame->sc, &ax)) 558 if (restore_sigcontext(regs, &frame->sc))
557 goto badframe; 559 goto badframe;
558 return ax; 560 return regs->ax;
559 561
560badframe: 562badframe:
561 signal_fault(regs, frame, "sigreturn"); 563 signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
568{ 570{
569 struct pt_regs *regs = current_pt_regs(); 571 struct pt_regs *regs = current_pt_regs();
570 struct rt_sigframe __user *frame; 572 struct rt_sigframe __user *frame;
571 unsigned long ax;
572 sigset_t set; 573 sigset_t set;
573 574
574 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 575 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,37 +580,23 @@ asmlinkage long sys_rt_sigreturn(void)
579 580
580 set_current_blocked(&set); 581 set_current_blocked(&set);
581 582
582 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 583 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
583 goto badframe; 584 goto badframe;
584 585
585 if (restore_altstack(&frame->uc.uc_stack)) 586 if (restore_altstack(&frame->uc.uc_stack))
586 goto badframe; 587 goto badframe;
587 588
588 return ax; 589 return regs->ax;
589 590
590badframe: 591badframe:
591 signal_fault(regs, frame, "rt_sigreturn"); 592 signal_fault(regs, frame, "rt_sigreturn");
592 return 0; 593 return 0;
593} 594}
594 595
595/*
596 * OK, we're invoking a handler:
597 */
598static int signr_convert(int sig)
599{
600#ifdef CONFIG_X86_32
601 struct thread_info *info = current_thread_info();
602
603 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
604 return info->exec_domain->signal_invmap[sig];
605#endif /* CONFIG_X86_32 */
606 return sig;
607}
608
609static int 596static int
610setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) 597setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
611{ 598{
612 int usig = signr_convert(ksig->sig); 599 int usig = ksig->sig;
613 sigset_t *set = sigmask_to_save(); 600 sigset_t *set = sigmask_to_save();
614 compat_sigset_t *cset = (compat_sigset_t *) set; 601 compat_sigset_t *cset = (compat_sigset_t *) set;
615 602
@@ -629,7 +616,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
629static void 616static void
630handle_signal(struct ksignal *ksig, struct pt_regs *regs) 617handle_signal(struct ksignal *ksig, struct pt_regs *regs)
631{ 618{
632 bool failed; 619 bool stepping, failed;
620
633 /* Are we from a system call? */ 621 /* Are we from a system call? */
634 if (syscall_get_nr(current, regs) >= 0) { 622 if (syscall_get_nr(current, regs) >= 0) {
635 /* If so, check system call restarting.. */ 623 /* If so, check system call restarting.. */
@@ -653,12 +641,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
653 } 641 }
654 642
655 /* 643 /*
656 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF 644 * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
657 * flag so that register information in the sigcontext is correct. 645 * so that register information in the sigcontext is correct and
646 * then notify the tracer before entering the signal handler.
658 */ 647 */
659 if (unlikely(regs->flags & X86_EFLAGS_TF) && 648 stepping = test_thread_flag(TIF_SINGLESTEP);
660 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 649 if (stepping)
661 regs->flags &= ~X86_EFLAGS_TF; 650 user_disable_single_step(current);
662 651
663 failed = (setup_rt_frame(ksig, regs) < 0); 652 failed = (setup_rt_frame(ksig, regs) < 0);
664 if (!failed) { 653 if (!failed) {
@@ -669,19 +658,17 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
669 * it might disable possible debug exception from the 658 * it might disable possible debug exception from the
670 * signal handler. 659 * signal handler.
671 * 660 *
672 * Clear TF when entering the signal handler, but 661 * Clear TF for the case when it wasn't set by debugger to
673 * notify any tracer that was single-stepping it. 662 * avoid the recursive send_sigtrap() in SIGTRAP handler.
674 * The tracer may want to single-step inside the
675 * handler too.
676 */ 663 */
677 regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); 664 regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
678 /* 665 /*
679 * Ensure the signal handler starts with the new fpu state. 666 * Ensure the signal handler starts with the new fpu state.
680 */ 667 */
681 if (used_math()) 668 if (used_math())
682 drop_init_fpu(current); 669 fpu_reset_state(current);
683 } 670 }
684 signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); 671 signal_setup_done(failed, ksig, stepping);
685} 672}
686 673
687#ifdef CONFIG_X86_32 674#ifdef CONFIG_X86_32
@@ -780,7 +767,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
780 struct pt_regs *regs = current_pt_regs(); 767 struct pt_regs *regs = current_pt_regs();
781 struct rt_sigframe_x32 __user *frame; 768 struct rt_sigframe_x32 __user *frame;
782 sigset_t set; 769 sigset_t set;
783 unsigned long ax;
784 770
785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 771 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
786 772
@@ -791,13 +777,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
791 777
792 set_current_blocked(&set); 778 set_current_blocked(&set);
793 779
794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 780 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
795 goto badframe; 781 goto badframe;
796 782
797 if (compat_restore_altstack(&frame->uc.uc_stack)) 783 if (compat_restore_altstack(&frame->uc.uc_stack))
798 goto badframe; 784 goto badframe;
799 785
800 return ax; 786 return regs->ax;
801 787
802badframe: 788badframe:
803 signal_fault(regs, frame, "x32 rt_sigreturn"); 789 signal_fault(regs, frame, "x32 rt_sigreturn");
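
Two details of the signal.c rework are worth restating: the sigreturn paths now simply return regs->ax after restore_sigcontext() has copied it back along with the other registers, and flags coming from the user frame are still filtered through FIX_EFLAGS. A standalone illustration of that masking step (the mask value approximates the usual flag set and is not the kernel's exact constant):

#include <stdio.h>

/* Only let the signal frame change "safe" flag bits (roughly CF, PF, AF, ZF,
 * SF, TF, DF, OF, RF); everything else is kept from the live register state.
 * Treat the constant as illustrative. */
#define FIX_EFLAGS_MODEL 0x00010DD5UL

static unsigned long restore_flags(unsigned long cur, unsigned long from_frame)
{
	return (cur & ~FIX_EFLAGS_MODEL) | (from_frame & FIX_EFLAGS_MODEL);
}

int main(void)
{
	unsigned long cur = 0x246;		/* IF and ZF set */
	unsigned long from_frame = 0xFFFFFFFF;	/* hostile frame tries to set everything */

	printf("restored flags: %#lx\n", restore_flags(cur, from_frame));
	return 0;
}
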
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..50e547eac8cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
77#include <asm/realmode.h> 77#include <asm/realmode.h>
78#include <asm/misc.h> 78#include <asm/misc.h>
79 79
80/* State of each CPU */
81DEFINE_PER_CPU(int, cpu_state) = { 0 };
82
83/* Number of siblings per CPU package */ 80/* Number of siblings per CPU package */
84int smp_num_siblings = 1; 81int smp_num_siblings = 1;
85EXPORT_SYMBOL(smp_num_siblings); 82EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
257 lock_vector_lock(); 254 lock_vector_lock();
258 set_cpu_online(smp_processor_id(), true); 255 set_cpu_online(smp_processor_id(), true);
259 unlock_vector_lock(); 256 unlock_vector_lock();
260 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 257 cpu_set_state_online(smp_processor_id());
261 x86_platform.nmi_init(); 258 x86_platform.nmi_init();
262 259
263 /* enable local interrupts */ 260 /* enable local interrupts */
@@ -779,6 +776,26 @@ out:
779 return boot_error; 776 return boot_error;
780} 777}
781 778
779void common_cpu_up(unsigned int cpu, struct task_struct *idle)
780{
781 /* Just in case we booted with a single CPU. */
782 alternatives_enable_smp();
783
784 per_cpu(current_task, cpu) = idle;
785
786#ifdef CONFIG_X86_32
787 /* Stack for startup_32 can be just as for start_secondary onwards */
788 irq_ctx_init(cpu);
789 per_cpu(cpu_current_top_of_stack, cpu) =
790 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
791#else
792 clear_tsk_thread_flag(idle, TIF_FORK);
793 initial_gs = per_cpu_offset(cpu);
794#endif
795 per_cpu(kernel_stack, cpu) =
796 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
797}
798
782/* 799/*
783 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 800 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
784 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 801 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
796 int cpu0_nmi_registered = 0; 813 int cpu0_nmi_registered = 0;
797 unsigned long timeout; 814 unsigned long timeout;
798 815
799 /* Just in case we booted with a single CPU. */
800 alternatives_enable_smp();
801
802 idle->thread.sp = (unsigned long) (((struct pt_regs *) 816 idle->thread.sp = (unsigned long) (((struct pt_regs *)
803 (THREAD_SIZE + task_stack_page(idle))) - 1); 817 (THREAD_SIZE + task_stack_page(idle))) - 1);
804 per_cpu(current_task, cpu) = idle;
805 818
806#ifdef CONFIG_X86_32
807 /* Stack for startup_32 can be just as for start_secondary onwards */
808 irq_ctx_init(cpu);
809#else
810 clear_tsk_thread_flag(idle, TIF_FORK);
811 initial_gs = per_cpu_offset(cpu);
812#endif
813 per_cpu(kernel_stack, cpu) =
814 (unsigned long)task_stack_page(idle) -
815 KERNEL_STACK_OFFSET + THREAD_SIZE;
816 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 819 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
817 initial_code = (unsigned long)start_secondary; 820 initial_code = (unsigned long)start_secondary;
818 stack_start = idle->thread.sp; 821 stack_start = idle->thread.sp;
@@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
948 */ 951 */
949 mtrr_save_state(); 952 mtrr_save_state();
950 953
951 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 954 /* x86 CPUs take themselves offline, so delayed offline is OK. */
955 err = cpu_check_up_prepare(cpu);
956 if (err && err != -EBUSY)
957 return err;
952 958
953 /* the FPU context is blank, nobody can own it */ 959 /* the FPU context is blank, nobody can own it */
954 __cpu_disable_lazy_restore(cpu); 960 __cpu_disable_lazy_restore(cpu);
955 961
962 common_cpu_up(cpu, tidle);
963
956 err = do_boot_cpu(apicid, cpu, tidle); 964 err = do_boot_cpu(apicid, cpu, tidle);
957 if (err) { 965 if (err) {
958 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); 966 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
1086 return SMP_NO_APIC; 1094 return SMP_NO_APIC;
1087 } 1095 }
1088 1096
1089 verify_local_APIC();
1090
1091 /* 1097 /*
1092 * If SMP should be disabled, then really disable it! 1098 * If SMP should be disabled, then really disable it!
1093 */ 1099 */
@@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
1191 switch_to_new_gdt(me); 1197 switch_to_new_gdt(me);
1192 /* already set me in cpu_online_mask in boot_cpu_init() */ 1198 /* already set me in cpu_online_mask in boot_cpu_init() */
1193 cpumask_set_cpu(me, cpu_callout_mask); 1199 cpumask_set_cpu(me, cpu_callout_mask);
1194 per_cpu(cpu_state, me) = CPU_ONLINE; 1200 cpu_set_state_online(me);
1195} 1201}
1196 1202
1197void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
1318 numa_remove_cpu(cpu); 1324 numa_remove_cpu(cpu);
1319} 1325}
1320 1326
1321static DEFINE_PER_CPU(struct completion, die_complete);
1322
1323void cpu_disable_common(void) 1327void cpu_disable_common(void)
1324{ 1328{
1325 int cpu = smp_processor_id(); 1329 int cpu = smp_processor_id();
1326 1330
1327 init_completion(&per_cpu(die_complete, smp_processor_id()));
1328
1329 remove_siblinginfo(cpu); 1331 remove_siblinginfo(cpu);
1330 1332
1331 /* It's now safe to remove this processor from the online map */ 1333 /* It's now safe to remove this processor from the online map */
@@ -1349,24 +1351,27 @@ int native_cpu_disable(void)
1349 return 0; 1351 return 0;
1350} 1352}
1351 1353
1352void cpu_die_common(unsigned int cpu) 1354int common_cpu_die(unsigned int cpu)
1353{ 1355{
1354 wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); 1356 int ret = 0;
1355}
1356 1357
1357void native_cpu_die(unsigned int cpu)
1358{
1359 /* We don't do anything here: idle task is faking death itself. */ 1358 /* We don't do anything here: idle task is faking death itself. */
1360 1359
1361 cpu_die_common(cpu);
1362
1363 /* They ack this in play_dead() by setting CPU_DEAD */ 1360 /* They ack this in play_dead() by setting CPU_DEAD */
1364 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1361 if (cpu_wait_death(cpu, 5)) {
1365 if (system_state == SYSTEM_RUNNING) 1362 if (system_state == SYSTEM_RUNNING)
1366 pr_info("CPU %u is now offline\n", cpu); 1363 pr_info("CPU %u is now offline\n", cpu);
1367 } else { 1364 } else {
1368 pr_err("CPU %u didn't die...\n", cpu); 1365 pr_err("CPU %u didn't die...\n", cpu);
1366 ret = -1;
1369 } 1367 }
1368
1369 return ret;
1370}
1371
1372void native_cpu_die(unsigned int cpu)
1373{
1374 common_cpu_die(cpu);
1370} 1375}
1371 1376
1372void play_dead_common(void) 1377void play_dead_common(void)
@@ -1375,10 +1380,8 @@ void play_dead_common(void)
1375 reset_lazy_tlbstate(); 1380 reset_lazy_tlbstate();
1376 amd_e400_remove_cpu(raw_smp_processor_id()); 1381 amd_e400_remove_cpu(raw_smp_processor_id());
1377 1382
1378 mb();
1379 /* Ack it */ 1383 /* Ack it */
1380 __this_cpu_write(cpu_state, CPU_DEAD); 1384 (void)cpu_report_death();
1381 complete(&per_cpu(die_complete, smp_processor_id()));
1382 1385
1383 /* 1386 /*
1384 * With physical CPU hotplug, we should halt the cpu 1387 * With physical CPU hotplug, we should halt the cpu
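
The hotplug hunks drop the open-coded completion plus cpu_state handshake in favor of the generic cpu_report_death()/cpu_wait_death() pair: the dying CPU announces itself, the controlling CPU waits with a timeout and reports an error otherwise. A rough userspace analogue of that behavior using a condition variable (the kernel helpers are lock-free, so this only models the timeout semantics; build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool dead;

/* Analogue of cpu_report_death(): the "dying CPU" announces it is gone. */
static void *dying_cpu(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	dead = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Analogue of cpu_wait_death(cpu, 5): wait, but give up after a timeout. */
static bool wait_death(int seconds)
{
	struct timespec deadline;
	int ret = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += seconds;

	pthread_mutex_lock(&lock);
	while (!dead && ret == 0)
		ret = pthread_cond_timedwait(&cond, &lock, &deadline);
	pthread_mutex_unlock(&lock);
	return dead;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, dying_cpu, NULL);
	puts(wait_death(5) ? "CPU is now offline" : "CPU didn't die...");
	pthread_join(t, NULL);
	return 0;
}
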
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e27431a..10e0272d789a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
34 return va_align.mask; 34 return va_align.mask;
35} 35}
36 36
37/*
38 * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
39 * va_align.bits, [12:upper_bit), are set to a random value instead of
40 * zeroing them. This random value is computed once per boot. This form
41 * of ASLR is known as "per-boot ASLR".
42 *
43 * To achieve this, the random value is added to the info.align_offset
44 * value before calling vm_unmapped_area() or ORed directly to the
45 * address.
46 */
47static unsigned long get_align_bits(void)
48{
49 return va_align.bits & get_align_mask();
50}
51
37unsigned long align_vdso_addr(unsigned long addr) 52unsigned long align_vdso_addr(unsigned long addr)
38{ 53{
39 unsigned long align_mask = get_align_mask(); 54 unsigned long align_mask = get_align_mask();
40 return (addr + align_mask) & ~align_mask; 55 addr = (addr + align_mask) & ~align_mask;
56 return addr | get_align_bits();
41} 57}
42 58
43static int __init control_va_addr_alignment(char *str) 59static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
135 info.length = len; 151 info.length = len;
136 info.low_limit = begin; 152 info.low_limit = begin;
137 info.high_limit = end; 153 info.high_limit = end;
138 info.align_mask = filp ? get_align_mask() : 0; 154 info.align_mask = 0;
139 info.align_offset = pgoff << PAGE_SHIFT; 155 info.align_offset = pgoff << PAGE_SHIFT;
156 if (filp) {
157 info.align_mask = get_align_mask();
158 info.align_offset += get_align_bits();
159 }
140 return vm_unmapped_area(&info); 160 return vm_unmapped_area(&info);
141} 161}
142 162
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
174 info.length = len; 194 info.length = len;
175 info.low_limit = PAGE_SIZE; 195 info.low_limit = PAGE_SIZE;
176 info.high_limit = mm->mmap_base; 196 info.high_limit = mm->mmap_base;
177 info.align_mask = filp ? get_align_mask() : 0; 197 info.align_mask = 0;
178 info.align_offset = pgoff << PAGE_SHIFT; 198 info.align_offset = pgoff << PAGE_SHIFT;
199 if (filp) {
200 info.align_mask = get_align_mask();
201 info.align_offset += get_align_bits();
202 }
179 addr = vm_unmapped_area(&info); 203 addr = vm_unmapped_area(&info);
180 if (!(addr & ~PAGE_MASK)) 204 if (!(addr & ~PAGE_MASK))
181 return addr; 205 return addr;
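
The comment added to sys_x86_64.c describes per-boot randomization of the bits below the alignment boundary, used to avoid I$ aliasing on AMD F15h while keeping mappings suitably aligned. The sketch below applies the same mask/bits arithmetic to a plain address (the mask width and the random value are invented for the example; the kernel takes both from va_align, filled in at boot):

#include <stdio.h>

/* Hypothetical per-boot values: align within a 32 KiB window and inject a
 * fixed random pattern into the page-aligned bits below it. */
static const unsigned long va_align_mask = 0x7fffUL;
static const unsigned long va_align_bits = 0x3000UL;

static unsigned long get_align_mask(void) { return va_align_mask; }
static unsigned long get_align_bits(void) { return va_align_bits & get_align_mask(); }

/* Mirrors align_vdso_addr(): round up to the alignment, then OR in the
 * per-boot random bits so the low bits are no longer always zero. */
static unsigned long align_addr(unsigned long addr)
{
	unsigned long mask = get_align_mask();

	addr = (addr + mask) & ~mask;
	return addr | get_align_bits();
}

int main(void)
{
	printf("%#lx -> %#lx\n", 0x7f1234561000UL, align_addr(0x7f1234561000UL));
	return 0;
}
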
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; 8#ifdef CONFIG_IA32_EMULATION
9#define SYM(sym, compat) compat
10#else
11#define SYM(sym, compat) sym
12#define ia32_sys_call_table sys_call_table
13#define __NR_ia32_syscall_max __NR_syscall_max
14#endif
15
16#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
9#include <asm/syscalls_32.h> 17#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386 18#undef __SYSCALL_I386
11 19
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, 20#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
13 21
14typedef asmlinkage void (*sys_call_ptr_t)(void); 22typedef asmlinkage void (*sys_call_ptr_t)(void);
15 23
16extern asmlinkage void sys_ni_syscall(void); 24extern asmlinkage void sys_ni_syscall(void);
17 25
18__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 26__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /* 27 /*
20 * Smells like a compiler bug -- it doesn't work 28 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed. 29 * when the & below is removed.
22 */ 30 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall, 31 [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h> 32#include <asm/syscalls_32.h>
25}; 33};
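
The syscall_32.c change builds one table that holds the compat entry points under CONFIG_IA32_EMULATION and the native ones otherwise, by routing every slot through a SYM() selector macro. A tiny standalone model of the expansion trick (the syscall names here are placeholders, and a single hand-written entry stands in for asm/syscalls_32.h):

#include <stdio.h>

#define IA32_EMULATION 1	/* flip to 0 to model the !CONFIG_IA32_EMULATION build */

#if IA32_EMULATION
# define SYM(sym, compat) compat
#else
# define SYM(sym, compat) sym
#endif

typedef void (*sys_call_ptr_t)(void);

void sys_example(void)        { puts("native entry"); }
void compat_sys_example(void) { puts("compat entry"); }
void sys_ni_syscall(void)     { puts("ENOSYS"); }

/* Each table slot picks either the native or the compat symbol at build time. */
#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),

static const sys_call_ptr_t table[] = {
	[0] = sys_ni_syscall,
	__SYSCALL_I386(1, sys_example, compat_sys_example)
};

int main(void)
{
	table[1]();	/* prints "compat entry" when IA32_EMULATION is 1 */
	return 0;
}
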
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
57 /* test 3: check the value hasn't changed */ 57 /* test 3: check the value hasn't changed */
58 /* If this test fails, we managed to overwrite the data */ 58 /* If this test fails, we managed to overwrite the data */
59 if (!rodata_test_data) { 59 if (!rodata_test_data) {
60 printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); 60 printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
61 return -ENODEV; 61 return -ENODEV;
62 } 62 }
63 /* test 4: check if the rodata section is 4Kb aligned */ 63 /* test 4: check if the rodata section is 4Kb aligned */
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
30{ 30{
31 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
32 32
33 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 33 if (!user_mode(regs) && in_lock_functions(pc)) {
34#ifdef CONFIG_FRAME_POINTER 34#ifdef CONFIG_FRAME_POINTER
35 return *(unsigned long *)(regs->bp + sizeof(long)); 35 return *(unsigned long *)(regs->bp + sizeof(long));
36#else 36#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..324ab5247687 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
112{ 112{
113 enum ctx_state prev_state; 113 enum ctx_state prev_state;
114 114
115 if (user_mode_vm(regs)) { 115 if (user_mode(regs)) {
116 /* Other than that, we're just an exception. */ 116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter(); 117 prev_state = exception_enter();
118 } else { 118 } else {
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
123 * but we need to notify RCU. 123 * but we need to notify RCU.
124 */ 124 */
125 rcu_nmi_enter(); 125 rcu_nmi_enter();
126 prev_state = IN_KERNEL; /* the value is irrelevant. */ 126 prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
127 } 127 }
128 128
129 /* 129 /*
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
146 /* Must be before exception_exit. */ 146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET); 147 preempt_count_sub(HARDIRQ_OFFSET);
148 148
149 if (user_mode_vm(regs)) 149 if (user_mode(regs))
150 return exception_exit(prev_state); 150 return exception_exit(prev_state);
151 else 151 else
152 rcu_nmi_exit(); 152 rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
158 * 158 *
159 * IST exception handlers normally cannot schedule. As a special 159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e. 160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not 161 * user_mode(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside 164 * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
167 */ 167 */
168void ist_begin_non_atomic(struct pt_regs *regs) 168void ist_begin_non_atomic(struct pt_regs *regs)
169{ 169{
170 BUG_ON(!user_mode_vm(regs)); 170 BUG_ON(!user_mode(regs));
171 171
172 /* 172 /*
173 * Sanity check: we need to be on the normal thread stack. This 173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_preempt_enable 174 * will catch asm bugs and any attempt to use ist_preempt_enable
175 * from double_fault. 175 * from double_fault.
176 */ 176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) 177 BUG_ON((unsigned long)(current_top_of_stack() -
178 & ~(THREAD_SIZE - 1)) != 0); 178 current_stack_pointer()) >= THREAD_SIZE);
179 179
180 preempt_count_sub(HARDIRQ_OFFSET); 180 preempt_count_sub(HARDIRQ_OFFSET);
181} 181}
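The reworked BUG_ON above replaces the old mask-the-stack-base comparison with a simpler distance check: the stack pointer must lie within THREAD_SIZE bytes below the top of the current thread stack. A minimal sketch of that check, assuming a downward-growing stack and using made-up values in place of current_top_of_stack()/current_stack_pointer():

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define THREAD_SIZE (16u * 1024)	/* illustrative; the real size is per-arch */

/* SP is on the thread stack iff it lies within THREAD_SIZE bytes of the top. */
static bool on_thread_stack(uint64_t top_of_stack, uint64_t sp)
{
	return top_of_stack - sp < THREAD_SIZE;
}

int main(void)
{
	uint64_t top = 0xffffc90000014000ULL;	/* made-up stack top */

	printf("%d\n", on_thread_stack(top, top - 0x100));		/* 1: on stack */
	printf("%d\n", on_thread_stack(top, top - THREAD_SIZE - 8));	/* 0: below it */
	return 0;
}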
@@ -194,8 +194,7 @@ static nokprobe_inline int
194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
195 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
196{ 196{
197#ifdef CONFIG_X86_32 197 if (v8086_mode(regs)) {
198 if (regs->flags & X86_VM_MASK) {
199 /* 198 /*
200 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 199 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
201 * On nmi (interrupt 2), do_trap should not be called. 200 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
207 } 206 }
208 return -1; 207 return -1;
209 } 208 }
210#endif 209
211 if (!user_mode(regs)) { 210 if (!user_mode(regs)) {
212 if (!fixup_exception(regs)) { 211 if (!fixup_exception(regs)) {
213 tsk->thread.error_code = error_code; 212 tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
384 goto exit; 383 goto exit;
385 conditional_sti(regs); 384 conditional_sti(regs);
386 385
387 if (!user_mode_vm(regs)) 386 if (!user_mode(regs))
388 die("bounds", regs, error_code); 387 die("bounds", regs, error_code);
389 388
390 if (!cpu_feature_enabled(X86_FEATURE_MPX)) { 389 if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
462 prev_state = exception_enter(); 461 prev_state = exception_enter();
463 conditional_sti(regs); 462 conditional_sti(regs);
464 463
465#ifdef CONFIG_X86_32 464 if (v8086_mode(regs)) {
466 if (regs->flags & X86_VM_MASK) {
467 local_irq_enable(); 465 local_irq_enable();
468 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 466 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
469 goto exit; 467 goto exit;
470 } 468 }
471#endif
472 469
473 tsk = current; 470 tsk = current;
474 if (!user_mode(regs)) { 471 if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
587 /* Copy the remainder of the stack from the current stack. */ 584 /* Copy the remainder of the stack from the current stack. */
588 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); 585 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
589 586
590 BUG_ON(!user_mode_vm(&new_stack->regs)); 587 BUG_ON(!user_mode(&new_stack->regs));
591 return new_stack; 588 return new_stack;
592} 589}
593NOKPROBE_SYMBOL(fixup_bad_iret); 590NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
637 * then it's very likely the result of an icebp/int01 trap. 634 * then it's very likely the result of an icebp/int01 trap.
638 * User wants a sigtrap for that. 635 * User wants a sigtrap for that.
639 */ 636 */
640 if (!dr6 && user_mode_vm(regs)) 637 if (!dr6 && user_mode(regs))
641 user_icebp = 1; 638 user_icebp = 1;
642 639
643 /* Catch kmemcheck conditions first of all! */ 640 /* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
673 /* It's safe to allow irq's after DR6 has been saved */ 670 /* It's safe to allow irq's after DR6 has been saved */
674 preempt_conditional_sti(regs); 671 preempt_conditional_sti(regs);
675 672
676 if (regs->flags & X86_VM_MASK) { 673 if (v8086_mode(regs)) {
677 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 674 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
678 X86_TRAP_DB); 675 X86_TRAP_DB);
679 preempt_conditional_cli(regs); 676 preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
721 return; 718 return;
722 conditional_sti(regs); 719 conditional_sti(regs);
723 720
724 if (!user_mode_vm(regs)) 721 if (!user_mode(regs))
725 { 722 {
726 if (!fixup_exception(regs)) { 723 if (!fixup_exception(regs)) {
727 task->thread.error_code = error_code; 724 task->thread.error_code = error_code;
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
734 /* 731 /*
735 * Save the info for the exception handler and clear the error. 732 * Save the info for the exception handler and clear the error.
736 */ 733 */
737 save_init_fpu(task); 734 unlazy_fpu(task);
738 task->thread.trap_nr = trapnr; 735 task->thread.trap_nr = trapnr;
739 task->thread.error_code = error_code; 736 task->thread.error_code = error_code;
740 info.si_signo = SIGFPE; 737 info.si_signo = SIGFPE;
@@ -863,7 +860,7 @@ void math_state_restore(void)
863 kernel_fpu_disable(); 860 kernel_fpu_disable();
864 __thread_fpu_begin(tsk); 861 __thread_fpu_begin(tsk);
865 if (unlikely(restore_fpu_checking(tsk))) { 862 if (unlikely(restore_fpu_checking(tsk))) {
866 drop_init_fpu(tsk); 863 fpu_reset_state(tsk);
867 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 864 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
868 } else { 865 } else {
869 tsk->thread.fpu_counter++; 866 tsk->thread.fpu_counter++;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
925/* Set of traps needed for early debugging. */ 922/* Set of traps needed for early debugging. */
926void __init early_trap_init(void) 923void __init early_trap_init(void)
927{ 924{
928 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 925 /*
926 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
927 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
928 * CPU runs at ring 0 so it is impossible to hit an invalid
929 * stack. Using the original stack works well enough at this
930 * early stage. DEBUG_STACK will be equipped after cpu_init() in
931 * trap_init().
932 *
933 * We don't need to set trace_idt_table like set_intr_gate(),
934 * since we don't have trace_debug and it will be reset to
935 * 'debug' in trap_init() by set_intr_gate_ist().
936 */
937 set_intr_gate_notrace(X86_TRAP_DB, debug);
929 /* int3 can be called from all */ 938 /* int3 can be called from all */
930 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 939 set_system_intr_gate(X86_TRAP_BP, &int3);
931#ifdef CONFIG_X86_32 940#ifdef CONFIG_X86_32
932 set_intr_gate(X86_TRAP_PF, page_fault); 941 set_intr_gate(X86_TRAP_PF, page_fault);
933#endif 942#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
1005 */ 1014 */
1006 cpu_init(); 1015 cpu_init();
1007 1016
1017 /*
1018 * X86_TRAP_DB and X86_TRAP_BP have been set
 1019 * in early_trap_init(). However, IST works only after
1020 * cpu_init() loads TSS. See comments in early_trap_init().
1021 */
1022 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
1023 /* int3 can be called from all */
1024 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
1025
1008 x86_init.irqs.trap_init(); 1026 x86_init.irqs.trap_init();
1009 1027
1010#ifdef CONFIG_X86_64 1028#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
912 int ret = NOTIFY_DONE; 912 int ret = NOTIFY_DONE;
913 913
914 /* We are only interested in userspace traps */ 914 /* We are only interested in userspace traps */
915 if (regs && !user_mode_vm(regs)) 915 if (regs && !user_mode(regs))
916 return NOTIFY_DONE; 916 return NOTIFY_DONE;
917 917
918 switch (val) { 918 switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
150 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
151 } 151 }
152 152
153 tss = &per_cpu(init_tss, get_cpu()); 153 tss = &per_cpu(cpu_tss, get_cpu());
154 current->thread.sp0 = current->thread.saved_sp0; 154 current->thread.sp0 = current->thread.saved_sp0;
155 current->thread.sysenter_cs = __KERNEL_CS; 155 current->thread.sysenter_cs = __KERNEL_CS;
156 load_sp0(tss, &current->thread); 156 load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 tsk->thread.saved_fs = info->regs32->fs; 318 tsk->thread.saved_fs = info->regs32->fs;
319 tsk->thread.saved_gs = get_user_gs(info->regs32); 319 tsk->thread.saved_gs = get_user_gs(info->regs32);
320 320
321 tss = &per_cpu(init_tss, get_cpu()); 321 tss = &per_cpu(cpu_tss, get_cpu());
322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
323 if (cpu_has_sep) 323 if (cpu_has_sep)
324 tsk->thread.sysenter_cs = 0; 324 tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index c7d791f32b98..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
31 gtod_write_begin(vdata); 31 gtod_write_begin(vdata);
32 32
33 /* copy vsyscall data */ 33 /* copy vsyscall data */
34 vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; 34 vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
35 vdata->cycle_last = tk->tkr.cycle_last; 35 vdata->cycle_last = tk->tkr_mono.cycle_last;
36 vdata->mask = tk->tkr.mask; 36 vdata->mask = tk->tkr_mono.mask;
37 vdata->mult = tk->tkr.mult; 37 vdata->mult = tk->tkr_mono.mult;
38 vdata->shift = tk->tkr.shift; 38 vdata->shift = tk->tkr_mono.shift;
39 39
40 vdata->wall_time_sec = tk->xtime_sec; 40 vdata->wall_time_sec = tk->xtime_sec;
41 vdata->wall_time_snsec = tk->tkr.xtime_nsec; 41 vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
42 42
43 vdata->monotonic_time_sec = tk->xtime_sec 43 vdata->monotonic_time_sec = tk->xtime_sec
44 + tk->wall_to_monotonic.tv_sec; 44 + tk->wall_to_monotonic.tv_sec;
45 vdata->monotonic_time_snsec = tk->tkr.xtime_nsec 45 vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
46 + ((u64)tk->wall_to_monotonic.tv_nsec 46 + ((u64)tk->wall_to_monotonic.tv_nsec
47 << tk->tkr.shift); 47 << tk->tkr_mono.shift);
48 while (vdata->monotonic_time_snsec >= 48 while (vdata->monotonic_time_snsec >=
49 (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { 49 (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
50 vdata->monotonic_time_snsec -= 50 vdata->monotonic_time_snsec -=
51 ((u64)NSEC_PER_SEC) << tk->tkr.shift; 51 ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
52 vdata->monotonic_time_sec++; 52 vdata->monotonic_time_sec++;
53 } 53 }
54 54
55 vdata->wall_time_coarse_sec = tk->xtime_sec; 55 vdata->wall_time_coarse_sec = tk->xtime_sec;
56 vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> 56 vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
57 tk->tkr.shift); 57 tk->tkr_mono.shift);
58 58
59 vdata->monotonic_time_coarse_sec = 59 vdata->monotonic_time_coarse_sec =
60 vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; 60 vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
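The tkr_mono fields above carry nanoseconds pre-shifted by the clocksource shift, so the carry into whole seconds compares against NSEC_PER_SEC << shift rather than NSEC_PER_SEC. A small illustrative sketch of that normalization loop (standalone, not the vDSO code; the shift value is made up):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Normalize a (sec, shifted-nsec) pair so that snsec < NSEC_PER_SEC << shift,
 * mirroring the carry loop in update_vsyscall() above (illustrative only). */
static void normalize(uint64_t *sec, uint64_t *snsec, unsigned int shift)
{
	while (*snsec >= (NSEC_PER_SEC << shift)) {
		*snsec -= NSEC_PER_SEC << shift;
		(*sec)++;
	}
}

int main(void)
{
	unsigned int shift = 25;				/* made-up clocksource shift */
	uint64_t sec = 100;
	uint64_t snsec = (3ULL * NSEC_PER_SEC + 7) << shift;	/* 3.000000007 s, shifted */

	normalize(&sec, &snsec, shift);
	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)sec,
	       (unsigned long long)(snsec >> shift));		/* expect sec=103 nsec=7 */
	return 0;
}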
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index cdc6cf903078..87a815b85f3e 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
342 config_enabled(CONFIG_IA32_EMULATION)); 342 config_enabled(CONFIG_IA32_EMULATION));
343 343
344 if (!buf) { 344 if (!buf) {
345 drop_init_fpu(tsk); 345 fpu_reset_state(tsk);
346 return 0; 346 return 0;
347 } 347 }
348 348
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
416 */ 416 */
417 user_fpu_begin(); 417 user_fpu_begin();
418 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { 418 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
419 drop_init_fpu(tsk); 419 fpu_reset_state(tsk);
420 return -1; 420 return -1;
421 } 421 }
422 } 422 }
@@ -678,19 +678,13 @@ void xsave_init(void)
678 this_func(); 678 this_func();
679} 679}
680 680
681static inline void __init eager_fpu_init_bp(void) 681/*
682{ 682 * setup_init_fpu_buf() is __init and it is OK to call it here because
683 current->thread.fpu.state = 683 * init_xstate_buf will be unset only once during boot.
684 alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); 684 */
685 if (!init_xstate_buf) 685void __init_refok eager_fpu_init(void)
686 setup_init_fpu_buf();
687}
688
689void eager_fpu_init(void)
690{ 686{
691 static __refdata void (*boot_func)(void) = eager_fpu_init_bp; 687 WARN_ON(used_math());
692
693 clear_used_math();
694 current_thread_info()->status = 0; 688 current_thread_info()->status = 0;
695 689
696 if (eagerfpu == ENABLE) 690 if (eagerfpu == ENABLE)
@@ -701,21 +695,8 @@ void eager_fpu_init(void)
701 return; 695 return;
702 } 696 }
703 697
704 if (boot_func) { 698 if (!init_xstate_buf)
705 boot_func(); 699 setup_init_fpu_buf();
706 boot_func = NULL;
707 }
708
709 /*
710 * This is same as math_state_restore(). But use_xsave() is
711 * not yet patched to use math_state_restore().
712 */
713 init_fpu(current);
714 __thread_fpu_begin(current);
715 if (cpu_has_xsave)
716 xrstor_state(init_xstate_buf, -1);
717 else
718 fxrstor_checking(&init_xstate_buf->i387);
719} 700}
720 701
721/* 702/*
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 08f790dfadc9..16e8f962eaad 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a80737ee6e6..59b69f6a2844 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
104 ((best->eax & 0xff00) >> 8) != 0) 104 ((best->eax & 0xff00) >> 8) != 0)
105 return -EINVAL; 105 return -EINVAL;
106 106
107 /* Update physical-address width */
108 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
109
107 kvm_pmu_cpuid_update(vcpu); 110 kvm_pmu_cpuid_update(vcpu);
108 return 0; 111 return 0;
109} 112}
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
135 } 138 }
136} 139}
137 140
141int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
142{
143 struct kvm_cpuid_entry2 *best;
144
145 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
146 if (!best || best->eax < 0x80000008)
147 goto not_found;
148 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
149 if (best)
150 return best->eax & 0xff;
151not_found:
152 return 36;
153}
154EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
155
138/* when an old userspace process fills a new kernel module */ 156/* when an old userspace process fills a new kernel module */
139int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 157int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
140 struct kvm_cpuid *cpuid, 158 struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
757} 775}
758EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); 776EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
759 777
760int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
761{
762 struct kvm_cpuid_entry2 *best;
763
764 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
765 if (!best || best->eax < 0x80000008)
766 goto not_found;
767 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
768 if (best)
769 return best->eax & 0xff;
770not_found:
771 return 36;
772}
773EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
774
775/* 778/*
776 * If no match is found, check whether we exceed the vCPU's limit 779 * If no match is found, check whether we exceed the vCPU's limit
777 * and return the content of the highest valid _standard_ leaf instead. 780 * and return the content of the highest valid _standard_ leaf instead.
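The change above computes the guest's physical-address width once per CPUID update and caches it in vcpu->arch.maxphyaddr; the width comes from CPUID leaf 0x80000008, EAX bits 7:0, with 36 bits as the fallback when that leaf is absent. A hedged userspace sketch of the same decode, reading the host CPUID via GCC's <cpuid.h> rather than KVM's per-vCPU tables:

#include <stdio.h>
#include <cpuid.h>

/* Physical-address width as reported by CPUID, falling back to 36 bits
 * when leaf 0x80000008 is not available (the same rule the hunk above
 * applies to the guest's CPUID entries). */
static int query_maxphyaddr(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) && eax >= 0x80000008 &&
	    __get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return eax & 0xff;
	return 36;
}

int main(void)
{
	printf("MAXPHYADDR = %d bits\n", query_maxphyaddr());
	return 0;
}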
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 4452eedfaedd..c3b1ad9fca81 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
20 struct kvm_cpuid_entry2 __user *entries); 20 struct kvm_cpuid_entry2 __user *entries);
21void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 21void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
22 22
23int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
24
25static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
26{
27 return vcpu->arch.maxphyaddr;
28}
23 29
24static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 30static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
25{ 31{
26 struct kvm_cpuid_entry2 *best; 32 struct kvm_cpuid_entry2 *best;
27 33
28 if (!static_cpu_has(X86_FEATURE_XSAVE)) 34 if (!static_cpu_has(X86_FEATURE_XSAVE))
29 return 0; 35 return false;
30 36
31 best = kvm_find_cpuid_entry(vcpu, 1, 0); 37 best = kvm_find_cpuid_entry(vcpu, 1, 0);
32 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 38 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106c01557f2b..630bcb0d7a04 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -248,27 +248,7 @@ struct mode_dual {
248 struct opcode mode64; 248 struct opcode mode64;
249}; 249};
250 250
251/* EFLAGS bit definitions. */
252#define EFLG_ID (1<<21)
253#define EFLG_VIP (1<<20)
254#define EFLG_VIF (1<<19)
255#define EFLG_AC (1<<18)
256#define EFLG_VM (1<<17)
257#define EFLG_RF (1<<16)
258#define EFLG_IOPL (3<<12)
259#define EFLG_NT (1<<14)
260#define EFLG_OF (1<<11)
261#define EFLG_DF (1<<10)
262#define EFLG_IF (1<<9)
263#define EFLG_TF (1<<8)
264#define EFLG_SF (1<<7)
265#define EFLG_ZF (1<<6)
266#define EFLG_AF (1<<4)
267#define EFLG_PF (1<<2)
268#define EFLG_CF (1<<0)
269
270#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a 251#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
271#define EFLG_RESERVED_ONE_MASK 2
272 252
273enum x86_transfer_type { 253enum x86_transfer_type {
274 X86_TRANSFER_NONE, 254 X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
317 * These EFLAGS bits are restored from saved value during emulation, and 297 * These EFLAGS bits are restored from saved value during emulation, and
318 * any changes are written back to the saved value after emulation. 298 * any changes are written back to the saved value after emulation.
319 */ 299 */
320#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) 300#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
301 X86_EFLAGS_PF|X86_EFLAGS_CF)
321 302
322#ifdef CONFIG_X86_64 303#ifdef CONFIG_X86_64
323#define ON64(x) x 304#define ON64(x) x
@@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
478 *dest = (*dest & ~mask) | (src & mask); 459 *dest = (*dest & ~mask) | (src & mask);
479} 460}
480 461
462static void assign_register(unsigned long *reg, u64 val, int bytes)
463{
464 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
465 switch (bytes) {
466 case 1:
467 *(u8 *)reg = (u8)val;
468 break;
469 case 2:
470 *(u16 *)reg = (u16)val;
471 break;
472 case 4:
473 *reg = (u32)val;
474 break; /* 64b: zero-extend */
475 case 8:
476 *reg = val;
477 break;
478 }
479}
480
481static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 481static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
482{ 482{
483 return (1UL << (ctxt->ad_bytes << 3)) - 1; 483 return (1UL << (ctxt->ad_bytes << 3)) - 1;
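assign_register() centralizes the writeback rule its comment describes: in 64-bit mode a 4-byte write zero-extends into the full register, while 1- and 2-byte writes leave the remaining bytes untouched. A standalone sketch of the same semantics, written as explicit merges instead of the pointer-cast stores used above:

#include <stdio.h>
#include <stdint.h>

/* Emulated write to a 64-bit GPR: 1/2-byte writes merge into the existing
 * value, a 4-byte write zero-extends, an 8-byte write replaces everything. */
static void write_gpr(uint64_t *reg, uint64_t val, int bytes)
{
	switch (bytes) {
	case 1:
		*reg = (*reg & ~0xffULL) | (uint8_t)val;
		break;
	case 2:
		*reg = (*reg & ~0xffffULL) | (uint16_t)val;
		break;
	case 4:
		*reg = (uint32_t)val;	/* upper 32 bits are cleared */
		break;
	case 8:
		*reg = val;
		break;
	}
}

int main(void)
{
	uint64_t rax = 0x1122334455667788ULL;

	write_gpr(&rax, 0xaa, 1);		/* AL-style write */
	printf("%016llx\n", (unsigned long long)rax);	/* 11223344556677aa */
	write_gpr(&rax, 0xdeadbeef, 4);		/* EAX-style write */
	printf("%016llx\n", (unsigned long long)rax);	/* 00000000deadbeef */
	return 0;
}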
@@ -943,6 +943,22 @@ FASTOP2(xadd);
943 943
944FASTOP2R(cmp, cmp_r); 944FASTOP2R(cmp, cmp_r);
945 945
946static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
947{
948 /* If src is zero, do not writeback, but update flags */
949 if (ctxt->src.val == 0)
950 ctxt->dst.type = OP_NONE;
951 return fastop(ctxt, em_bsf);
952}
953
954static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
955{
956 /* If src is zero, do not writeback, but update flags */
957 if (ctxt->src.val == 0)
958 ctxt->dst.type = OP_NONE;
959 return fastop(ctxt, em_bsr);
960}
961
946static u8 test_cc(unsigned int condition, unsigned long flags) 962static u8 test_cc(unsigned int condition, unsigned long flags)
947{ 963{
948 u8 rc; 964 u8 rc;
@@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1399 unsigned int in_page, n; 1415 unsigned int in_page, n;
1400 unsigned int count = ctxt->rep_prefix ? 1416 unsigned int count = ctxt->rep_prefix ?
1401 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; 1417 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
1402 in_page = (ctxt->eflags & EFLG_DF) ? 1418 in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
1403 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : 1419 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
1404 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); 1420 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
1405 n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); 1421 n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1412 } 1428 }
1413 1429
1414 if (ctxt->rep_prefix && (ctxt->d & String) && 1430 if (ctxt->rep_prefix && (ctxt->d & String) &&
1415 !(ctxt->eflags & EFLG_DF)) { 1431 !(ctxt->eflags & X86_EFLAGS_DF)) {
1416 ctxt->dst.data = rc->data + rc->pos; 1432 ctxt->dst.data = rc->data + rc->pos;
1417 ctxt->dst.type = OP_MEM_STR; 1433 ctxt->dst.type = OP_MEM_STR;
1418 ctxt->dst.count = (rc->end - rc->pos) / size; 1434 ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1691 1707
1692static void write_register_operand(struct operand *op) 1708static void write_register_operand(struct operand *op)
1693{ 1709{
1694 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ 1710 return assign_register(op->addr.reg, op->val, op->bytes);
1695 switch (op->bytes) {
1696 case 1:
1697 *(u8 *)op->addr.reg = (u8)op->val;
1698 break;
1699 case 2:
1700 *(u16 *)op->addr.reg = (u16)op->val;
1701 break;
1702 case 4:
1703 *op->addr.reg = (u32)op->val;
1704 break; /* 64b: zero-extend */
1705 case 8:
1706 *op->addr.reg = op->val;
1707 break;
1708 }
1709} 1711}
1710 1712
1711static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) 1713static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1792{ 1794{
1793 int rc; 1795 int rc;
1794 unsigned long val, change_mask; 1796 unsigned long val, change_mask;
1795 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1797 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
1796 int cpl = ctxt->ops->cpl(ctxt); 1798 int cpl = ctxt->ops->cpl(ctxt);
1797 1799
1798 rc = emulate_pop(ctxt, &val, len); 1800 rc = emulate_pop(ctxt, &val, len);
1799 if (rc != X86EMUL_CONTINUE) 1801 if (rc != X86EMUL_CONTINUE)
1800 return rc; 1802 return rc;
1801 1803
1802 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF 1804 change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
1803 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; 1805 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
1806 X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
1807 X86_EFLAGS_AC | X86_EFLAGS_ID;
1804 1808
1805 switch(ctxt->mode) { 1809 switch(ctxt->mode) {
1806 case X86EMUL_MODE_PROT64: 1810 case X86EMUL_MODE_PROT64:
1807 case X86EMUL_MODE_PROT32: 1811 case X86EMUL_MODE_PROT32:
1808 case X86EMUL_MODE_PROT16: 1812 case X86EMUL_MODE_PROT16:
1809 if (cpl == 0) 1813 if (cpl == 0)
1810 change_mask |= EFLG_IOPL; 1814 change_mask |= X86_EFLAGS_IOPL;
1811 if (cpl <= iopl) 1815 if (cpl <= iopl)
1812 change_mask |= EFLG_IF; 1816 change_mask |= X86_EFLAGS_IF;
1813 break; 1817 break;
1814 case X86EMUL_MODE_VM86: 1818 case X86EMUL_MODE_VM86:
1815 if (iopl < 3) 1819 if (iopl < 3)
1816 return emulate_gp(ctxt, 0); 1820 return emulate_gp(ctxt, 0);
1817 change_mask |= EFLG_IF; 1821 change_mask |= X86_EFLAGS_IF;
1818 break; 1822 break;
1819 default: /* real mode */ 1823 default: /* real mode */
1820 change_mask |= (EFLG_IOPL | EFLG_IF); 1824 change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
1821 break; 1825 break;
1822 } 1826 }
1823 1827
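The rewritten emulate_popf() mask spells out which EFLAGS bits a POPF may actually change: the arithmetic and system flags always, IOPL only at CPL 0, and IF only when CPL <= IOPL (real mode allows both; vm86 with IOPL < 3 faults instead). A small sketch of the protected-mode rule, using the standard EFLAGS bit positions:

#include <stdio.h>

/* Subset of EFLAGS bits, same values as the arch/x86 uapi definitions. */
#define FL_CF   (1u << 0)
#define FL_PF   (1u << 2)
#define FL_AF   (1u << 4)
#define FL_ZF   (1u << 6)
#define FL_SF   (1u << 7)
#define FL_TF   (1u << 8)
#define FL_IF   (1u << 9)
#define FL_DF   (1u << 10)
#define FL_OF   (1u << 11)
#define FL_IOPL (3u << 12)
#define FL_NT   (1u << 14)
#define FL_AC   (1u << 18)
#define FL_ID   (1u << 21)

/* Which bits POPF may change in protected mode, per the rules the hunk
 * above encodes (sketch only; real mode and vm86 differ as noted there). */
static unsigned int popf_change_mask(int cpl, int iopl)
{
	unsigned int mask = FL_CF | FL_PF | FL_AF | FL_ZF | FL_SF | FL_OF |
			    FL_TF | FL_DF | FL_NT | FL_AC | FL_ID;

	if (cpl == 0)
		mask |= FL_IOPL;
	if (cpl <= iopl)
		mask |= FL_IF;
	return mask;
}

int main(void)
{
	printf("CPL0/IOPL0: %#x\n", popf_change_mask(0, 0));	/* may touch IOPL and IF */
	printf("CPL3/IOPL0: %#x\n", popf_change_mask(3, 0));	/* neither IOPL nor IF */
	return 0;
}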
@@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
1918 1922
1919static int em_pushf(struct x86_emulate_ctxt *ctxt) 1923static int em_pushf(struct x86_emulate_ctxt *ctxt)
1920{ 1924{
1921 ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; 1925 ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
1922 return em_push(ctxt); 1926 return em_push(ctxt);
1923} 1927}
1924 1928
@@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1926{ 1930{
1927 int rc = X86EMUL_CONTINUE; 1931 int rc = X86EMUL_CONTINUE;
1928 int reg = VCPU_REGS_RDI; 1932 int reg = VCPU_REGS_RDI;
1933 u32 val;
1929 1934
1930 while (reg >= VCPU_REGS_RAX) { 1935 while (reg >= VCPU_REGS_RAX) {
1931 if (reg == VCPU_REGS_RSP) { 1936 if (reg == VCPU_REGS_RSP) {
@@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1933 --reg; 1938 --reg;
1934 } 1939 }
1935 1940
1936 rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); 1941 rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
1937 if (rc != X86EMUL_CONTINUE) 1942 if (rc != X86EMUL_CONTINUE)
1938 break; 1943 break;
1944 assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
1939 --reg; 1945 --reg;
1940 } 1946 }
1941 return rc; 1947 return rc;
@@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1956 if (rc != X86EMUL_CONTINUE) 1962 if (rc != X86EMUL_CONTINUE)
1957 return rc; 1963 return rc;
1958 1964
1959 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1965 ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
1960 1966
1961 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); 1967 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1962 rc = em_push(ctxt); 1968 rc = em_push(ctxt);
@@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2022 unsigned long temp_eip = 0; 2028 unsigned long temp_eip = 0;
2023 unsigned long temp_eflags = 0; 2029 unsigned long temp_eflags = 0;
2024 unsigned long cs = 0; 2030 unsigned long cs = 0;
2025 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | 2031 unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
2026 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | 2032 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
2027 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ 2033 X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
2028 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; 2034 X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
2035 X86_EFLAGS_AC | X86_EFLAGS_ID |
2036 X86_EFLAGS_FIXED;
2037 unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
2038 X86_EFLAGS_VIP;
2029 2039
2030 /* TODO: Add stack limit check */ 2040 /* TODO: Add stack limit check */
2031 2041
@@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2054 2064
2055 ctxt->_eip = temp_eip; 2065 ctxt->_eip = temp_eip;
2056 2066
2057
2058 if (ctxt->op_bytes == 4) 2067 if (ctxt->op_bytes == 4)
2059 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); 2068 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
2060 else if (ctxt->op_bytes == 2) { 2069 else if (ctxt->op_bytes == 2) {
@@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2063 } 2072 }
2064 2073
2065 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ 2074 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
2066 ctxt->eflags |= EFLG_RESERVED_ONE_MASK; 2075 ctxt->eflags |= X86_EFLAGS_FIXED;
2067 ctxt->ops->set_nmi_mask(ctxt, false); 2076 ctxt->ops->set_nmi_mask(ctxt, false);
2068 2077
2069 return rc; 2078 return rc;
@@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
2145 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { 2154 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
2146 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); 2155 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
2147 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); 2156 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
2148 ctxt->eflags &= ~EFLG_ZF; 2157 ctxt->eflags &= ~X86_EFLAGS_ZF;
2149 } else { 2158 } else {
2150 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | 2159 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
2151 (u32) reg_read(ctxt, VCPU_REGS_RBX); 2160 (u32) reg_read(ctxt, VCPU_REGS_RBX);
2152 2161
2153 ctxt->eflags |= EFLG_ZF; 2162 ctxt->eflags |= X86_EFLAGS_ZF;
2154 } 2163 }
2155 return X86EMUL_CONTINUE; 2164 return X86EMUL_CONTINUE;
2156} 2165}
@@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2222 ctxt->src.val = ctxt->dst.orig_val; 2231 ctxt->src.val = ctxt->dst.orig_val;
2223 fastop(ctxt, em_cmp); 2232 fastop(ctxt, em_cmp);
2224 2233
2225 if (ctxt->eflags & EFLG_ZF) { 2234 if (ctxt->eflags & X86_EFLAGS_ZF) {
2226 /* Success: write back to memory; no update of EAX */ 2235 /* Success: write back to memory; no update of EAX */
2227 ctxt->src.type = OP_NONE; 2236 ctxt->src.type = OP_NONE;
2228 ctxt->dst.val = ctxt->src.orig_val; 2237 ctxt->dst.val = ctxt->src.orig_val;
@@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2381 2390
2382 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); 2391 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
2383 ctxt->eflags &= ~msr_data; 2392 ctxt->eflags &= ~msr_data;
2384 ctxt->eflags |= EFLG_RESERVED_ONE_MASK; 2393 ctxt->eflags |= X86_EFLAGS_FIXED;
2385#endif 2394#endif
2386 } else { 2395 } else {
2387 /* legacy mode */ 2396 /* legacy mode */
2388 ops->get_msr(ctxt, MSR_STAR, &msr_data); 2397 ops->get_msr(ctxt, MSR_STAR, &msr_data);
2389 ctxt->_eip = (u32)msr_data; 2398 ctxt->_eip = (u32)msr_data;
2390 2399
2391 ctxt->eflags &= ~(EFLG_VM | EFLG_IF); 2400 ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
2392 } 2401 }
2393 2402
2394 return X86EMUL_CONTINUE; 2403 return X86EMUL_CONTINUE;
@@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2425 if ((msr_data & 0xfffc) == 0x0) 2434 if ((msr_data & 0xfffc) == 0x0)
2426 return emulate_gp(ctxt, 0); 2435 return emulate_gp(ctxt, 0);
2427 2436
2428 ctxt->eflags &= ~(EFLG_VM | EFLG_IF); 2437 ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
2429 cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK; 2438 cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
2430 ss_sel = cs_sel + 8; 2439 ss_sel = cs_sel + 8;
2431 if (efer & EFER_LMA) { 2440 if (efer & EFER_LMA) {
2432 cs.d = 0; 2441 cs.d = 0;
@@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2493 return emulate_gp(ctxt, 0); 2502 return emulate_gp(ctxt, 0);
2494 break; 2503 break;
2495 } 2504 }
2496 cs_sel |= SELECTOR_RPL_MASK; 2505 cs_sel |= SEGMENT_RPL_MASK;
2497 ss_sel |= SELECTOR_RPL_MASK; 2506 ss_sel |= SEGMENT_RPL_MASK;
2498 2507
2499 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2508 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2500 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2509 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
@@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2512 return false; 2521 return false;
2513 if (ctxt->mode == X86EMUL_MODE_VM86) 2522 if (ctxt->mode == X86EMUL_MODE_VM86)
2514 return true; 2523 return true;
2515 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2524 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
2516 return ctxt->ops->cpl(ctxt) > iopl; 2525 return ctxt->ops->cpl(ctxt) > iopl;
2517} 2526}
2518 2527
@@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2782 return ret; 2791 return ret;
2783 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, 2792 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
2784 X86_TRANSFER_TASK_SWITCH, NULL); 2793 X86_TRANSFER_TASK_SWITCH, NULL);
2785 if (ret != X86EMUL_CONTINUE)
2786 return ret;
2787 2794
2788 return X86EMUL_CONTINUE; 2795 return ret;
2789} 2796}
2790 2797
2791static int task_switch_32(struct x86_emulate_ctxt *ctxt, 2798static int task_switch_32(struct x86_emulate_ctxt *ctxt,
@@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2954static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, 2961static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
2955 struct operand *op) 2962 struct operand *op)
2956{ 2963{
2957 int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; 2964 int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
2958 2965
2959 register_address_increment(ctxt, reg, df * op->bytes); 2966 register_address_increment(ctxt, reg, df * op->bytes);
2960 op->addr.mem.ea = register_address(ctxt, reg); 2967 op->addr.mem.ea = register_address(ctxt, reg);
@@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
3323 return X86EMUL_CONTINUE; 3330 return X86EMUL_CONTINUE;
3324} 3331}
3325 3332
3326static int em_vmcall(struct x86_emulate_ctxt *ctxt) 3333static int em_hypercall(struct x86_emulate_ctxt *ctxt)
3327{ 3334{
3328 int rc = ctxt->ops->fix_hypercall(ctxt); 3335 int rc = ctxt->ops->fix_hypercall(ctxt);
3329 3336
@@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
3395 return em_lgdt_lidt(ctxt, true); 3402 return em_lgdt_lidt(ctxt, true);
3396} 3403}
3397 3404
3398static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
3399{
3400 int rc;
3401
3402 rc = ctxt->ops->fix_hypercall(ctxt);
3403
3404 /* Disable writeback. */
3405 ctxt->dst.type = OP_NONE;
3406 return rc;
3407}
3408
3409static int em_lidt(struct x86_emulate_ctxt *ctxt) 3405static int em_lidt(struct x86_emulate_ctxt *ctxt)
3410{ 3406{
3411 return em_lgdt_lidt(ctxt, false); 3407 return em_lgdt_lidt(ctxt, false);
@@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt)
3504{ 3500{
3505 u32 flags; 3501 u32 flags;
3506 3502
3507 flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; 3503 flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
3504 X86_EFLAGS_SF;
3508 flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; 3505 flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
3509 3506
3510 ctxt->eflags &= ~0xffUL; 3507 ctxt->eflags &= ~0xffUL;
@@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3769 3766
3770static const struct opcode group7_rm0[] = { 3767static const struct opcode group7_rm0[] = {
3771 N, 3768 N,
3772 I(SrcNone | Priv | EmulateOnUD, em_vmcall), 3769 I(SrcNone | Priv | EmulateOnUD, em_hypercall),
3773 N, N, N, N, N, N, 3770 N, N, N, N, N, N,
3774}; 3771};
3775 3772
@@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = {
3781 3778
3782static const struct opcode group7_rm3[] = { 3779static const struct opcode group7_rm3[] = {
3783 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 3780 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3784 II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), 3781 II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
3785 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), 3782 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3786 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), 3783 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3787 DIP(SrcNone | Prot | Priv, stgi, check_svme), 3784 DIP(SrcNone | Prot | Priv, stgi, check_svme),
@@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = {
4192 N, N, 4189 N, N,
4193 G(BitOp, group8), 4190 G(BitOp, group8),
4194 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 4191 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
4195 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), 4192 I(DstReg | SrcMem | ModRM, em_bsf_c),
4193 I(DstReg | SrcMem | ModRM, em_bsr_c),
4196 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4194 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
4197 /* 0xC0 - 0xC7 */ 4195 /* 0xC0 - 0xC7 */
4198 F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), 4196 F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
@@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
4759 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || 4757 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
4760 (ctxt->b == 0xae) || (ctxt->b == 0xaf)) 4758 (ctxt->b == 0xae) || (ctxt->b == 0xaf))
4761 && (((ctxt->rep_prefix == REPE_PREFIX) && 4759 && (((ctxt->rep_prefix == REPE_PREFIX) &&
4762 ((ctxt->eflags & EFLG_ZF) == 0)) 4760 ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
4763 || ((ctxt->rep_prefix == REPNE_PREFIX) && 4761 || ((ctxt->rep_prefix == REPNE_PREFIX) &&
4764 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) 4762 ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
4765 return true; 4763 return true;
4766 4764
4767 return false; 4765 return false;
@@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4913 /* All REP prefixes have the same first termination condition */ 4911 /* All REP prefixes have the same first termination condition */
4914 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { 4912 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
4915 ctxt->eip = ctxt->_eip; 4913 ctxt->eip = ctxt->_eip;
4916 ctxt->eflags &= ~EFLG_RF; 4914 ctxt->eflags &= ~X86_EFLAGS_RF;
4917 goto done; 4915 goto done;
4918 } 4916 }
4919 } 4917 }
@@ -4963,9 +4961,9 @@ special_insn:
4963 } 4961 }
4964 4962
4965 if (ctxt->rep_prefix && (ctxt->d & String)) 4963 if (ctxt->rep_prefix && (ctxt->d & String))
4966 ctxt->eflags |= EFLG_RF; 4964 ctxt->eflags |= X86_EFLAGS_RF;
4967 else 4965 else
4968 ctxt->eflags &= ~EFLG_RF; 4966 ctxt->eflags &= ~X86_EFLAGS_RF;
4969 4967
4970 if (ctxt->execute) { 4968 if (ctxt->execute) {
4971 if (ctxt->d & Fastop) { 4969 if (ctxt->d & Fastop) {
@@ -5014,7 +5012,7 @@ special_insn:
5014 rc = emulate_int(ctxt, ctxt->src.val); 5012 rc = emulate_int(ctxt, ctxt->src.val);
5015 break; 5013 break;
5016 case 0xce: /* into */ 5014 case 0xce: /* into */
5017 if (ctxt->eflags & EFLG_OF) 5015 if (ctxt->eflags & X86_EFLAGS_OF)
5018 rc = emulate_int(ctxt, 4); 5016 rc = emulate_int(ctxt, 4);
5019 break; 5017 break;
5020 case 0xe9: /* jmp rel */ 5018 case 0xe9: /* jmp rel */
@@ -5027,19 +5025,19 @@ special_insn:
5027 break; 5025 break;
5028 case 0xf5: /* cmc */ 5026 case 0xf5: /* cmc */
5029 /* complement carry flag from eflags reg */ 5027 /* complement carry flag from eflags reg */
5030 ctxt->eflags ^= EFLG_CF; 5028 ctxt->eflags ^= X86_EFLAGS_CF;
5031 break; 5029 break;
5032 case 0xf8: /* clc */ 5030 case 0xf8: /* clc */
5033 ctxt->eflags &= ~EFLG_CF; 5031 ctxt->eflags &= ~X86_EFLAGS_CF;
5034 break; 5032 break;
5035 case 0xf9: /* stc */ 5033 case 0xf9: /* stc */
5036 ctxt->eflags |= EFLG_CF; 5034 ctxt->eflags |= X86_EFLAGS_CF;
5037 break; 5035 break;
5038 case 0xfc: /* cld */ 5036 case 0xfc: /* cld */
5039 ctxt->eflags &= ~EFLG_DF; 5037 ctxt->eflags &= ~X86_EFLAGS_DF;
5040 break; 5038 break;
5041 case 0xfd: /* std */ 5039 case 0xfd: /* std */
5042 ctxt->eflags |= EFLG_DF; 5040 ctxt->eflags |= X86_EFLAGS_DF;
5043 break; 5041 break;
5044 default: 5042 default:
5045 goto cannot_emulate; 5043 goto cannot_emulate;
@@ -5100,7 +5098,7 @@ writeback:
5100 } 5098 }
5101 goto done; /* skip rip writeback */ 5099 goto done; /* skip rip writeback */
5102 } 5100 }
5103 ctxt->eflags &= ~EFLG_RF; 5101 ctxt->eflags &= ~X86_EFLAGS_RF;
5104 } 5102 }
5105 5103
5106 ctxt->eip = ctxt->_eip; 5104 ctxt->eip = ctxt->_eip;
@@ -5137,8 +5135,7 @@ twobyte_insn:
5137 case 0x40 ... 0x4f: /* cmov */ 5135 case 0x40 ... 0x4f: /* cmov */
5138 if (test_cc(ctxt->b, ctxt->eflags)) 5136 if (test_cc(ctxt->b, ctxt->eflags))
5139 ctxt->dst.val = ctxt->src.val; 5137 ctxt->dst.val = ctxt->src.val;
5140 else if (ctxt->mode != X86EMUL_MODE_PROT64 || 5138 else if (ctxt->op_bytes != 4)
5141 ctxt->op_bytes != 4)
5142 ctxt->dst.type = OP_NONE; /* no writeback */ 5139 ctxt->dst.type = OP_NONE; /* no writeback */
5143 break; 5140 break;
5144 case 0x80 ... 0x8f: /* jnz rel, etc*/ 5141 case 0x80 ... 0x8f: /* jnz rel, etc*/
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 298781d4cfb4..4dce6f8b6129 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr)
443 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 443 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
444} 444}
445 445
446static int pit_ioport_write(struct kvm_io_device *this, 446static int pit_ioport_write(struct kvm_vcpu *vcpu,
447 struct kvm_io_device *this,
447 gpa_t addr, int len, const void *data) 448 gpa_t addr, int len, const void *data)
448{ 449{
449 struct kvm_pit *pit = dev_to_pit(this); 450 struct kvm_pit *pit = dev_to_pit(this);
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
519 return 0; 520 return 0;
520} 521}
521 522
522static int pit_ioport_read(struct kvm_io_device *this, 523static int pit_ioport_read(struct kvm_vcpu *vcpu,
524 struct kvm_io_device *this,
523 gpa_t addr, int len, void *data) 525 gpa_t addr, int len, void *data)
524{ 526{
525 struct kvm_pit *pit = dev_to_pit(this); 527 struct kvm_pit *pit = dev_to_pit(this);
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
589 return 0; 591 return 0;
590} 592}
591 593
592static int speaker_ioport_write(struct kvm_io_device *this, 594static int speaker_ioport_write(struct kvm_vcpu *vcpu,
595 struct kvm_io_device *this,
593 gpa_t addr, int len, const void *data) 596 gpa_t addr, int len, const void *data)
594{ 597{
595 struct kvm_pit *pit = speaker_to_pit(this); 598 struct kvm_pit *pit = speaker_to_pit(this);
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this,
606 return 0; 609 return 0;
607} 610}
608 611
609static int speaker_ioport_read(struct kvm_io_device *this, 612static int speaker_ioport_read(struct kvm_vcpu *vcpu,
610 gpa_t addr, int len, void *data) 613 struct kvm_io_device *this,
614 gpa_t addr, int len, void *data)
611{ 615{
612 struct kvm_pit *pit = speaker_to_pit(this); 616 struct kvm_pit *pit = speaker_to_pit(this);
613 struct kvm_kpit_state *pit_state = &pit->pit_state; 617 struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index dd1b16b611b0..c84990b42b5b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5 5
6#include "iodev.h" 6#include <kvm/iodev.h>
7 7
8struct kvm_kpit_channel_state { 8struct kvm_kpit_channel_state {
9 u32 count; /* can be 65536 */ 9 u32 count; /* can be 65536 */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9541ba34126b..fef922ff2635 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -529,42 +529,42 @@ static int picdev_read(struct kvm_pic *s,
529 return 0; 529 return 0;
530} 530}
531 531
532static int picdev_master_write(struct kvm_io_device *dev, 532static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
533 gpa_t addr, int len, const void *val) 533 gpa_t addr, int len, const void *val)
534{ 534{
535 return picdev_write(container_of(dev, struct kvm_pic, dev_master), 535 return picdev_write(container_of(dev, struct kvm_pic, dev_master),
536 addr, len, val); 536 addr, len, val);
537} 537}
538 538
539static int picdev_master_read(struct kvm_io_device *dev, 539static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
540 gpa_t addr, int len, void *val) 540 gpa_t addr, int len, void *val)
541{ 541{
542 return picdev_read(container_of(dev, struct kvm_pic, dev_master), 542 return picdev_read(container_of(dev, struct kvm_pic, dev_master),
543 addr, len, val); 543 addr, len, val);
544} 544}
545 545
546static int picdev_slave_write(struct kvm_io_device *dev, 546static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
547 gpa_t addr, int len, const void *val) 547 gpa_t addr, int len, const void *val)
548{ 548{
549 return picdev_write(container_of(dev, struct kvm_pic, dev_slave), 549 return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
550 addr, len, val); 550 addr, len, val);
551} 551}
552 552
553static int picdev_slave_read(struct kvm_io_device *dev, 553static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
554 gpa_t addr, int len, void *val) 554 gpa_t addr, int len, void *val)
555{ 555{
556 return picdev_read(container_of(dev, struct kvm_pic, dev_slave), 556 return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
557 addr, len, val); 557 addr, len, val);
558} 558}
559 559
560static int picdev_eclr_write(struct kvm_io_device *dev, 560static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
561 gpa_t addr, int len, const void *val) 561 gpa_t addr, int len, const void *val)
562{ 562{
563 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), 563 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
564 addr, len, val); 564 addr, len, val);
565} 565}
566 566
567static int picdev_eclr_read(struct kvm_io_device *dev, 567static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
568 gpa_t addr, int len, void *val) 568 gpa_t addr, int len, void *val)
569{ 569{
570 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), 570 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 46d4449772bc..28146f03c514 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
206 206
207 old_irr = ioapic->irr; 207 old_irr = ioapic->irr;
208 ioapic->irr |= mask; 208 ioapic->irr |= mask;
209 if (edge)
210 ioapic->irr_delivered &= ~mask;
209 if ((edge && old_irr == ioapic->irr) || 211 if ((edge && old_irr == ioapic->irr) ||
210 (!edge && entry.fields.remote_irr)) { 212 (!edge && entry.fields.remote_irr)) {
211 ret = 0; 213 ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
349 irqe.shorthand = 0; 351 irqe.shorthand = 0;
350 352
351 if (irqe.trig_mode == IOAPIC_EDGE_TRIG) 353 if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
352 ioapic->irr &= ~(1 << irq); 354 ioapic->irr_delivered |= 1 << irq;
353 355
354 if (irq == RTC_GSI && line_status) { 356 if (irq == RTC_GSI && line_status) {
355 /* 357 /*
@@ -473,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
473 } 475 }
474} 476}
475 477
476bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
477{
478 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
479 smp_rmb();
480 return test_bit(vector, ioapic->handled_vectors);
481}
482
483void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) 478void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
484{ 479{
485 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 480 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
@@ -500,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
500 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); 495 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
501} 496}
502 497
503static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, 498static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
504 void *val) 499 gpa_t addr, int len, void *val)
505{ 500{
506 struct kvm_ioapic *ioapic = to_ioapic(this); 501 struct kvm_ioapic *ioapic = to_ioapic(this);
507 u32 result; 502 u32 result;
@@ -543,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
543 return 0; 538 return 0;
544} 539}
545 540
546static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, 541static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
547 const void *val) 542 gpa_t addr, int len, const void *val)
548{ 543{
549 struct kvm_ioapic *ioapic = to_ioapic(this); 544 struct kvm_ioapic *ioapic = to_ioapic(this);
550 u32 data; 545 u32 data;
@@ -599,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
599 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; 594 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
600 ioapic->ioregsel = 0; 595 ioapic->ioregsel = 0;
601 ioapic->irr = 0; 596 ioapic->irr = 0;
597 ioapic->irr_delivered = 0;
602 ioapic->id = 0; 598 ioapic->id = 0;
603 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); 599 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
604 rtc_irq_eoi_tracking_reset(ioapic); 600 rtc_irq_eoi_tracking_reset(ioapic);
@@ -656,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
656 652
657 spin_lock(&ioapic->lock); 653 spin_lock(&ioapic->lock);
658 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); 654 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
655 state->irr &= ~ioapic->irr_delivered;
659 spin_unlock(&ioapic->lock); 656 spin_unlock(&ioapic->lock);
660 return 0; 657 return 0;
661} 658}
@@ -669,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
669 spin_lock(&ioapic->lock); 666 spin_lock(&ioapic->lock);
670 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 667 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
671 ioapic->irr = 0; 668 ioapic->irr = 0;
669 ioapic->irr_delivered = 0;
672 update_handled_vectors(ioapic); 670 update_handled_vectors(ioapic);
673 kvm_vcpu_request_scan_ioapic(kvm); 671 kvm_vcpu_request_scan_ioapic(kvm);
674 kvm_ioapic_inject_all(ioapic, state->irr); 672 kvm_ioapic_inject_all(ioapic, state->irr);
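The irr_delivered bitmap added above lets edge-triggered IRR bits stay set internally after delivery while being hidden from the state userspace saves, so a save/restore cycle no longer re-injects interrupts that were already delivered. A toy model of that bookkeeping (names and structure are made up, not KVM's):

#include <stdio.h>

/* Edge IRR bits stay set once delivered, but are masked out of saved state. */
struct toy_ioapic {
	unsigned int irr;
	unsigned int irr_delivered;
};

static void raise_edge(struct toy_ioapic *io, int pin)
{
	io->irr |= 1u << pin;
	io->irr_delivered &= ~(1u << pin);	/* a fresh edge may be delivered again */
}

static void service(struct toy_ioapic *io, int pin)
{
	io->irr_delivered |= 1u << pin;		/* delivered, but irr is left intact */
}

static unsigned int saved_irr(const struct toy_ioapic *io)
{
	return io->irr & ~io->irr_delivered;	/* what kvm_get_ioapic() exports */
}

int main(void)
{
	struct toy_ioapic io = { 0, 0 };

	raise_edge(&io, 3);
	printf("pending before delivery: %#x\n", saved_irr(&io));	/* 0x8 */
	service(&io, 3);
	printf("pending after delivery:  %#x\n", saved_irr(&io));	/* 0x0 */
	return 0;
}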
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index c2e36d934af4..ca0b0b4e6256 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5 5
6#include "iodev.h" 6#include <kvm/iodev.h>
7 7
8struct kvm; 8struct kvm;
9struct kvm_vcpu; 9struct kvm_vcpu;
@@ -77,6 +77,7 @@ struct kvm_ioapic {
77 struct rtc_status rtc_status; 77 struct rtc_status rtc_status;
78 struct delayed_work eoi_inject; 78 struct delayed_work eoi_inject;
79 u32 irq_eoi[IOAPIC_NUM_PINS]; 79 u32 irq_eoi[IOAPIC_NUM_PINS];
80 u32 irr_delivered;
80}; 81};
81 82
82#ifdef DEBUG 83#ifdef DEBUG
@@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
97 return kvm->arch.vioapic; 98 return kvm->arch.vioapic;
98} 99}
99 100
101static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
102{
103 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
104 smp_rmb();
105 return test_bit(vector, ioapic->handled_vectors);
106}
107
100void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 108void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
101bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 109bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
102 int short_hand, unsigned int dest, int dest_mode); 110 int short_hand, unsigned int dest, int dest_mode);
103int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 111int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
104void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, 112void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
105 int trigger_mode); 113 int trigger_mode);
106bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
107int kvm_ioapic_init(struct kvm *kvm); 114int kvm_ioapic_init(struct kvm *kvm);
108void kvm_ioapic_destroy(struct kvm *kvm); 115void kvm_ioapic_destroy(struct kvm *kvm);
109int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 116int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2d03568e9498..ad68c73008c5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -27,7 +27,7 @@
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29 29
30#include "iodev.h" 30#include <kvm/iodev.h>
31#include "ioapic.h" 31#include "ioapic.h"
32#include "lapic.h" 32#include "lapic.h"
33 33
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4ee827d7bf36..d67206a7b99a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
133 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 133 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
134} 134}
135 135
136/* The logical map is definitely wrong if we have multiple
137 * modes at the same time. (Physical map is always right.)
138 */
139static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
140{
141 return !(map->mode & (map->mode - 1));
142}
143
144static inline void
145apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
146{
147 unsigned lid_bits;
148
149 BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4);
150 BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8);
151 BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16);
152 lid_bits = map->mode;
153
154 *cid = dest_id >> lid_bits;
155 *lid = dest_id & ((1 << lid_bits) - 1);
156}
157
136static void recalculate_apic_map(struct kvm *kvm) 158static void recalculate_apic_map(struct kvm *kvm)
137{ 159{
138 struct kvm_apic_map *new, *old = NULL; 160 struct kvm_apic_map *new, *old = NULL;
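The new map encoding makes the mode constants double as the number of logical-ID bits (4 for xAPIC cluster, 8 for xAPIC flat, 16 for x2APIC), so the logical map is usable only while at most one mode bit is set, and splitting a destination into cluster and in-cluster parts is a shift and a mask by that bit count. A short sketch of the arithmetic with illustrative values:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Mode constants double as the number of logical-ID bits, matching the
 * BUILD_BUG_ONs above (4 = xAPIC cluster, 8 = xAPIC flat, 16 = x2APIC). */
enum { MODE_XAPIC_CLUSTER = 4, MODE_XAPIC_FLAT = 8, MODE_X2APIC = 16 };

static bool logical_map_valid(unsigned int mode)
{
	/* At most one mode bit set (zero means no logically addressed APICs). */
	return !(mode & (mode - 1));
}

static void split_logical_id(unsigned int mode, uint32_t dest_id,
			     uint16_t *cid, uint16_t *lid)
{
	unsigned int lid_bits = mode;

	*cid = dest_id >> lid_bits;
	*lid = dest_id & ((1u << lid_bits) - 1);
}

int main(void)
{
	uint16_t cid, lid;

	split_logical_id(MODE_XAPIC_CLUSTER, 0x35, &cid, &lid);
	printf("cluster=%u id-in-cluster=%#x\n", (unsigned)cid, (unsigned)lid);	/* 3, 0x5 */

	printf("mixed modes valid? %d\n",
	       logical_map_valid(MODE_XAPIC_FLAT | MODE_X2APIC));		/* 0 */
	return 0;
}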
@@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm)
146 if (!new) 168 if (!new)
147 goto out; 169 goto out;
148 170
149 new->ldr_bits = 8;
150 /* flat mode is default */
151 new->cid_shift = 8;
152 new->cid_mask = 0;
153 new->lid_mask = 0xff;
154 new->broadcast = APIC_BROADCAST;
155
156 kvm_for_each_vcpu(i, vcpu, kvm) {
157 struct kvm_lapic *apic = vcpu->arch.apic;
158
159 if (!kvm_apic_present(vcpu))
160 continue;
161
162 if (apic_x2apic_mode(apic)) {
163 new->ldr_bits = 32;
164 new->cid_shift = 16;
165 new->cid_mask = new->lid_mask = 0xffff;
166 new->broadcast = X2APIC_BROADCAST;
167 } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
168 if (kvm_apic_get_reg(apic, APIC_DFR) ==
169 APIC_DFR_CLUSTER) {
170 new->cid_shift = 4;
171 new->cid_mask = 0xf;
172 new->lid_mask = 0xf;
173 } else {
174 new->cid_shift = 8;
175 new->cid_mask = 0;
176 new->lid_mask = 0xff;
177 }
178 }
179
180 /*
181 * All APICs have to be configured in the same mode by an OS.
182 * We take advatage of this while building logical id loockup
183 * table. After reset APICs are in software disabled mode, so if
184 * we find apic with different setting we assume this is the mode
185 * OS wants all apics to be in; build lookup table accordingly.
186 */
187 if (kvm_apic_sw_enabled(apic))
188 break;
189 }
190
191 kvm_for_each_vcpu(i, vcpu, kvm) { 171 kvm_for_each_vcpu(i, vcpu, kvm) {
192 struct kvm_lapic *apic = vcpu->arch.apic; 172 struct kvm_lapic *apic = vcpu->arch.apic;
193 u16 cid, lid; 173 u16 cid, lid;
@@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm)
198 178
199 aid = kvm_apic_id(apic); 179 aid = kvm_apic_id(apic);
200 ldr = kvm_apic_get_reg(apic, APIC_LDR); 180 ldr = kvm_apic_get_reg(apic, APIC_LDR);
201 cid = apic_cluster_id(new, ldr);
202 lid = apic_logical_id(new, ldr);
203 181
204 if (aid < ARRAY_SIZE(new->phys_map)) 182 if (aid < ARRAY_SIZE(new->phys_map))
205 new->phys_map[aid] = apic; 183 new->phys_map[aid] = apic;
184
185 if (apic_x2apic_mode(apic)) {
186 new->mode |= KVM_APIC_MODE_X2APIC;
187 } else if (ldr) {
188 ldr = GET_APIC_LOGICAL_ID(ldr);
189 if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
190 new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
191 else
192 new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
193 }
194
195 if (!kvm_apic_logical_map_valid(new))
196 continue;
197
198 apic_logical_id(new, ldr, &cid, &lid);
199
206 if (lid && cid < ARRAY_SIZE(new->logical_map)) 200 if (lid && cid < ARRAY_SIZE(new->logical_map))
207 new->logical_map[cid][ffs(lid) - 1] = apic; 201 new->logical_map[cid][ffs(lid) - 1] = apic;
208 } 202 }
@@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
588 apic_update_ppr(apic); 582 apic_update_ppr(apic);
589} 583}
590 584
591static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) 585static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
592{ 586{
593 return dest == (apic_x2apic_mode(apic) ? 587 if (apic_x2apic_mode(apic))
594 X2APIC_BROADCAST : APIC_BROADCAST); 588 return mda == X2APIC_BROADCAST;
589
590 return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
595} 591}
596 592
597static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) 593static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
598{ 594{
599 return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); 595 if (kvm_apic_broadcast(apic, mda))
596 return true;
597
598 if (apic_x2apic_mode(apic))
599 return mda == kvm_apic_id(apic);
600
601 return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
600} 602}
601 603
602static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) 604static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
613 && (logical_id & mda & 0xffff) != 0; 615 && (logical_id & mda & 0xffff) != 0;
614 616
615 logical_id = GET_APIC_LOGICAL_ID(logical_id); 617 logical_id = GET_APIC_LOGICAL_ID(logical_id);
618 mda = GET_APIC_DEST_FIELD(mda);
616 619
617 switch (kvm_apic_get_reg(apic, APIC_DFR)) { 620 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
618 case APIC_DFR_FLAT: 621 case APIC_DFR_FLAT:
@@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
627 } 630 }
628} 631}
629 632
633/* KVM APIC implementation has two quirks
634 * - dest always begins at 0 while xAPIC MDA has offset 24,
635 * - IOxAPIC messages have to be delivered (directly) to x2APIC.
636 */
637static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
638 struct kvm_lapic *target)
639{
640 bool ipi = source != NULL;
641 bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
642
643 if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
644 return X2APIC_BROADCAST;
645
646 return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
647}
648
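kvm_apic_mda() is where the two quirks meet: xAPIC destinations live in bits 31:24 of the MDA (the SET_APIC_DEST_FIELD shift), x2APIC destinations use all 32 bits, and an I/O APIC broadcast aimed at an x2APIC target is promoted to the x2APIC broadcast ID. A compact sketch of that decision, assuming the usual apicdef.h encodings (shift by 24, 0xff and 0xffffffff broadcast IDs); names below are illustrative:

#include <stdint.h>
#include <stdio.h>

#define SET_DEST_FIELD(x)   ((uint32_t)(x) << 24)   /* xAPIC MDA: bits 31:24 */
#define APIC_BCAST          0xffu
#define X2APIC_BCAST        0xffffffffu

/* from_ipi: message sent by another local APIC (otherwise by the I/O APIC) */
static uint32_t make_mda(uint32_t dest_id, int from_ipi,
			 int source_is_x2apic, int target_is_x2apic)
{
	int x2apic_mda = from_ipi ? source_is_x2apic : target_is_x2apic;

	if (!from_ipi && dest_id == APIC_BCAST && x2apic_mda)
		return X2APIC_BCAST;   /* let I/O APIC broadcasts reach x2APIC CPUs */

	return x2apic_mda ? dest_id : SET_DEST_FIELD(dest_id);
}

int main(void)
{
	printf("%#x\n", make_mda(0x03, 0, 0, 0));  /* xAPIC target: 0x03000000 */
	printf("%#x\n", make_mda(0x03, 1, 1, 0));  /* x2APIC IPI:   0x3        */
	printf("%#x\n", make_mda(0xff, 0, 0, 1));  /* broadcast:    0xffffffff */
	return 0;
}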
630bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 649bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
631 int short_hand, unsigned int dest, int dest_mode) 650 int short_hand, unsigned int dest, int dest_mode)
632{ 651{
633 struct kvm_lapic *target = vcpu->arch.apic; 652 struct kvm_lapic *target = vcpu->arch.apic;
653 u32 mda = kvm_apic_mda(dest, source, target);
634 654
635 apic_debug("target %p, source %p, dest 0x%x, " 655 apic_debug("target %p, source %p, dest 0x%x, "
636 "dest_mode 0x%x, short_hand 0x%x\n", 656 "dest_mode 0x%x, short_hand 0x%x\n",
@@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
640 switch (short_hand) { 660 switch (short_hand) {
641 case APIC_DEST_NOSHORT: 661 case APIC_DEST_NOSHORT:
642 if (dest_mode == APIC_DEST_PHYSICAL) 662 if (dest_mode == APIC_DEST_PHYSICAL)
643 return kvm_apic_match_physical_addr(target, dest); 663 return kvm_apic_match_physical_addr(target, mda);
644 else 664 else
645 return kvm_apic_match_logical_addr(target, dest); 665 return kvm_apic_match_logical_addr(target, mda);
646 case APIC_DEST_SELF: 666 case APIC_DEST_SELF:
647 return target == source; 667 return target == source;
648 case APIC_DEST_ALLINC: 668 case APIC_DEST_ALLINC:
@@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
664 struct kvm_lapic **dst; 684 struct kvm_lapic **dst;
665 int i; 685 int i;
666 bool ret = false; 686 bool ret = false;
687 bool x2apic_ipi = src && apic_x2apic_mode(src);
667 688
668 *r = -1; 689 *r = -1;
669 690
@@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
675 if (irq->shorthand) 696 if (irq->shorthand)
676 return false; 697 return false;
677 698
699 if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
700 return false;
701
678 rcu_read_lock(); 702 rcu_read_lock();
679 map = rcu_dereference(kvm->arch.apic_map); 703 map = rcu_dereference(kvm->arch.apic_map);
680 704
681 if (!map) 705 if (!map)
682 goto out; 706 goto out;
683 707
684 if (irq->dest_id == map->broadcast)
685 goto out;
686
687 ret = true; 708 ret = true;
688 709
689 if (irq->dest_mode == APIC_DEST_PHYSICAL) { 710 if (irq->dest_mode == APIC_DEST_PHYSICAL) {
@@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
692 713
693 dst = &map->phys_map[irq->dest_id]; 714 dst = &map->phys_map[irq->dest_id];
694 } else { 715 } else {
695 u32 mda = irq->dest_id << (32 - map->ldr_bits); 716 u16 cid;
696 u16 cid = apic_cluster_id(map, mda); 717
718 if (!kvm_apic_logical_map_valid(map)) {
719 ret = false;
720 goto out;
721 }
722
723 apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
697 724
698 if (cid >= ARRAY_SIZE(map->logical_map)) 725 if (cid >= ARRAY_SIZE(map->logical_map))
699 goto out; 726 goto out;
700 727
701 dst = map->logical_map[cid]; 728 dst = map->logical_map[cid];
702 729
703 bitmap = apic_logical_id(map, mda);
704
705 if (irq->delivery_mode == APIC_DM_LOWEST) { 730 if (irq->delivery_mode == APIC_DM_LOWEST) {
706 int l = -1; 731 int l = -1;
707 for_each_set_bit(i, &bitmap, 16) { 732 for_each_set_bit(i, &bitmap, 16) {
@@ -1037,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
1037 addr < apic->base_address + LAPIC_MMIO_LENGTH; 1062 addr < apic->base_address + LAPIC_MMIO_LENGTH;
1038} 1063}
1039 1064
1040static int apic_mmio_read(struct kvm_io_device *this, 1065static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1041 gpa_t address, int len, void *data) 1066 gpa_t address, int len, void *data)
1042{ 1067{
1043 struct kvm_lapic *apic = to_lapic(this); 1068 struct kvm_lapic *apic = to_lapic(this);
@@ -1357,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
1357 return ret; 1382 return ret;
1358} 1383}
1359 1384
1360static int apic_mmio_write(struct kvm_io_device *this, 1385static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1361 gpa_t address, int len, const void *data) 1386 gpa_t address, int len, const void *data)
1362{ 1387{
1363 struct kvm_lapic *apic = to_lapic(this); 1388 struct kvm_lapic *apic = to_lapic(this);
@@ -1497,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1497 return; 1522 return;
1498 } 1523 }
1499 1524
1500 if (!kvm_vcpu_is_bsp(apic->vcpu))
1501 value &= ~MSR_IA32_APICBASE_BSP;
1502 vcpu->arch.apic_base = value; 1525 vcpu->arch.apic_base = value;
1503 1526
1504 /* update jump label if enable bit changes */ 1527 /* update jump label if enable bit changes */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0bc6c656625b..9d28383fc1e7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,7 +1,7 @@
1#ifndef __KVM_X86_LAPIC_H 1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include <kvm/iodev.h>
5 5
6#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
7 7
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
148 return kvm_x86_ops->vm_has_apicv(kvm); 148 return kvm_x86_ops->vm_has_apicv(kvm);
149} 149}
150 150
151static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
152{
153 u16 cid;
154 ldr >>= 32 - map->ldr_bits;
155 cid = (ldr >> map->cid_shift) & map->cid_mask;
156
157 return cid;
158}
159
160static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
161{
162 ldr >>= (32 - map->ldr_bits);
163 return ldr & map->lid_mask;
164}
165
166static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
167{ 152{
168 return vcpu->arch.apic->pending_events; 153 return vcpu->arch.apic->pending_events;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cee759299a35..146f295ee322 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4465 kvm_flush_remote_tlbs(kvm); 4465 kvm_flush_remote_tlbs(kvm);
4466} 4466}
4467 4467
4468static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4469 unsigned long *rmapp)
4470{
4471 u64 *sptep;
4472 struct rmap_iterator iter;
4473 int need_tlb_flush = 0;
4474 pfn_t pfn;
4475 struct kvm_mmu_page *sp;
4476
4477 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
4478 BUG_ON(!(*sptep & PT_PRESENT_MASK));
4479
4480 sp = page_header(__pa(sptep));
4481 pfn = spte_to_pfn(*sptep);
4482
4483 /*
4484 * Only EPT supported for now; otherwise, one would need to
4485 * find out efficiently whether the guest page tables are
4486 * also using huge pages.
4487 */
4488 if (sp->role.direct &&
4489 !kvm_is_reserved_pfn(pfn) &&
4490 PageTransCompound(pfn_to_page(pfn))) {
4491 drop_spte(kvm, sptep);
4492 sptep = rmap_get_first(*rmapp, &iter);
4493 need_tlb_flush = 1;
4494 } else
4495 sptep = rmap_get_next(&iter);
4496 }
4497
4498 return need_tlb_flush;
4499}
4500
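kvm_mmu_zap_collapsible_spte() drops entries while it walks the rmap chain, and dropping one invalidates the iterator, which is why the loop goes back to rmap_get_first() instead of advancing. The same pattern on a plain linked list (all names here are illustrative, not kernel API):

#include <stdlib.h>

struct node { int collapsible; struct node *next; };

static void drop(struct node **head, struct node *victim)
{
	struct node **pp = head;

	while (*pp != victim)
		pp = &(*pp)->next;
	*pp = victim->next;
	free(victim);
}

/* returns nonzero if anything was dropped, so the caller can batch a flush */
static int zap_collapsible(struct node **head)
{
	struct node *n = *head;
	int dropped = 0;

	while (n) {
		if (n->collapsible) {
			drop(head, n);
			n = *head;      /* iterator is stale: restart from the head */
			dropped = 1;
		} else {
			n = n->next;    /* nothing removed: advancing is safe       */
		}
	}
	return dropped;
}

The return value plays the role of need_tlb_flush above: the walk itself never flushes, it only reports whether the caller has something to flush.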
4501void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
4502 struct kvm_memory_slot *memslot)
4503{
4504 bool flush = false;
4505 unsigned long *rmapp;
4506 unsigned long last_index, index;
4507 gfn_t gfn_start, gfn_end;
4508
4509 spin_lock(&kvm->mmu_lock);
4510
4511 gfn_start = memslot->base_gfn;
4512 gfn_end = memslot->base_gfn + memslot->npages - 1;
4513
4514 if (gfn_start >= gfn_end)
4515 goto out;
4516
4517 rmapp = memslot->arch.rmap[0];
4518 last_index = gfn_to_index(gfn_end, memslot->base_gfn,
4519 PT_PAGE_TABLE_LEVEL);
4520
4521 for (index = 0; index <= last_index; ++index, ++rmapp) {
4522 if (*rmapp)
4523 flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
4524
4525 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4526 if (flush) {
4527 kvm_flush_remote_tlbs(kvm);
4528 flush = false;
4529 }
4530 cond_resched_lock(&kvm->mmu_lock);
4531 }
4532 }
4533
4534 if (flush)
4535 kvm_flush_remote_tlbs(kvm);
4536
4537out:
4538 spin_unlock(&kvm->mmu_lock);
4539}
4540
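The caller batches TLB flushes, but any pending flush has to be issued before cond_resched_lock() can drop mmu_lock, otherwise another CPU could keep using stale translations for the SPTEs that were just removed. The shape of that pattern, reduced to a sketch with stand-in names (nothing here is the real kernel API):

/* stand-ins for kvm_flush_remote_tlbs()/cond_resched_lock(); illustration only */
static void flush_remote_tlbs(void)           { /* would IPI all vCPUs here   */ }
static int  lock_needs_break(unsigned long i) { return (i & 3) == 3;             }
static void yield_lock(void)                  { /* drop, schedule, reacquire  */ }

static void zap_range(unsigned long nr_slots)
{
	int flush = 0;
	unsigned long i;

	for (i = 0; i < nr_slots; i++) {
		flush |= 1;                      /* pretend this slot dropped SPTEs */

		if (lock_needs_break(i)) {
			if (flush) {
				flush_remote_tlbs();  /* publish drops before yielding */
				flush = 0;
			}
			yield_lock();
		}
	}
	if (flush)
		flush_remote_tlbs();             /* final batched flush */
}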
4468void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 4541void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4469 struct kvm_memory_slot *memslot) 4542 struct kvm_memory_slot *memslot)
4470{ 4543{
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 8e6b7d869d2f..29fbf9dfdc54 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping {
38}; 38};
39 39
40/* mapping between fixed pmc index and arch_events array */ 40/* mapping between fixed pmc index and arch_events array */
41int fixed_pmc_events[] = {1, 0, 7}; 41static int fixed_pmc_events[] = {1, 0, 7};
42 42
43static bool pmc_is_gp(struct kvm_pmc *pmc) 43static bool pmc_is_gp(struct kvm_pmc *pmc)
44{ 44{
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cc618c882f90..ce741b8650f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1261 1261
1262 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | 1262 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1263 MSR_IA32_APICBASE_ENABLE; 1263 MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1264 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1265 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1265 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1266 1266
1267 svm_init_osvw(&svm->vcpu); 1267 svm_init_osvw(&svm->vcpu);
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
1929static int halt_interception(struct vcpu_svm *svm) 1929static int halt_interception(struct vcpu_svm *svm)
1930{ 1930{
1931 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1931 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1932 skip_emulated_instruction(&svm->vcpu);
1933 return kvm_emulate_halt(&svm->vcpu); 1932 return kvm_emulate_halt(&svm->vcpu);
1934} 1933}
1935 1934
1936static int vmmcall_interception(struct vcpu_svm *svm) 1935static int vmmcall_interception(struct vcpu_svm *svm)
1937{ 1936{
1938 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1937 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1939 skip_emulated_instruction(&svm->vcpu);
1940 kvm_emulate_hypercall(&svm->vcpu); 1938 kvm_emulate_hypercall(&svm->vcpu);
1941 return 1; 1939 return 1;
1942} 1940}
@@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
2757{ 2755{
2758 struct kvm_vcpu *vcpu = &svm->vcpu; 2756 struct kvm_vcpu *vcpu = &svm->vcpu;
2759 2757
2760 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], 2758 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2761 vcpu->arch.regs[VCPU_REGS_RAX]); 2759 kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2762 2760
2763 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2761 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2764 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 2762 kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2765 2763
2766 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2764 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2767 skip_emulated_instruction(&svm->vcpu); 2765 skip_emulated_instruction(&svm->vcpu);
@@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm)
2770 2768
2771static int skinit_interception(struct vcpu_svm *svm) 2769static int skinit_interception(struct vcpu_svm *svm)
2772{ 2770{
2773 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); 2771 trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2774 2772
2775 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2773 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2776 return 1; 2774 return 1;
2777} 2775}
2778 2776
2777static int wbinvd_interception(struct vcpu_svm *svm)
2778{
2779 kvm_emulate_wbinvd(&svm->vcpu);
2780 return 1;
2781}
2782
2779static int xsetbv_interception(struct vcpu_svm *svm) 2783static int xsetbv_interception(struct vcpu_svm *svm)
2780{ 2784{
2781 u64 new_bv = kvm_read_edx_eax(&svm->vcpu); 2785 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm)
2902 return 1; 2906 return 1;
2903} 2907}
2904 2908
2905bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) 2909static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
2910 unsigned long val)
2906{ 2911{
2907 unsigned long cr0 = svm->vcpu.arch.cr0; 2912 unsigned long cr0 = svm->vcpu.arch.cr0;
2908 bool ret = false; 2913 bool ret = false;
@@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm)
2940 return emulate_on_interception(svm); 2945 return emulate_on_interception(svm);
2941 2946
2942 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2947 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2943 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2948 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2949 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2950 else
2951 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2944 2952
2945 err = 0; 2953 err = 0;
2946 if (cr >= 16) { /* mov to cr */ 2954 if (cr >= 16) { /* mov to cr */
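With SVM_EXIT_CR0_SEL_WRITE now routed to cr_interception(), the handler folds it into the ordinary write-CR0 slot before the usual decode, where reads occupy slots 0-15 and writes 16-31; that layout is what the existing "cr >= 16" test relies on. A sketch of the decode under that assumption, with an illustrative value for the selective-write exit code:

enum {
	EXIT_READ_CR0      = 0,    /* reads:  slots 0..15  (assumed layout) */
	EXIT_WRITE_CR0     = 16,   /* writes: slots 16..31 (assumed layout) */
	EXIT_CR0_SEL_WRITE = 101   /* illustrative, not the real constant   */
};

static int decode_cr(int exit_code, int *mov_to_cr)
{
	int cr;

	if (exit_code == EXIT_CR0_SEL_WRITE)
		cr = EXIT_WRITE_CR0 - EXIT_READ_CR0;   /* treat as plain CR0 write */
	else
		cr = exit_code - EXIT_READ_CR0;

	*mov_to_cr = cr >= 16;
	return *mov_to_cr ? cr - 16 : cr;              /* CR number 0..15 */
}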
@@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3133 3141
3134static int rdmsr_interception(struct vcpu_svm *svm) 3142static int rdmsr_interception(struct vcpu_svm *svm)
3135{ 3143{
3136 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3144 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3137 u64 data; 3145 u64 data;
3138 3146
3139 if (svm_get_msr(&svm->vcpu, ecx, &data)) { 3147 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
@@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
3142 } else { 3150 } else {
3143 trace_kvm_msr_read(ecx, data); 3151 trace_kvm_msr_read(ecx, data);
3144 3152
3145 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 3153 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
3146 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 3154 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
3147 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3155 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3148 skip_emulated_instruction(&svm->vcpu); 3156 skip_emulated_instruction(&svm->vcpu);
3149 } 3157 }
@@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3246static int wrmsr_interception(struct vcpu_svm *svm) 3254static int wrmsr_interception(struct vcpu_svm *svm)
3247{ 3255{
3248 struct msr_data msr; 3256 struct msr_data msr;
3249 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3257 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3250 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3258 u64 data = kvm_read_edx_eax(&svm->vcpu);
3251 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3252 3259
3253 msr.data = data; 3260 msr.data = data;
3254 msr.index = ecx; 3261 msr.index = ecx;
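The open-coded RAX/RDX masking is replaced by kvm_read_edx_eax(), which for WRMSR is just the architectural EDX:EAX concatenation. What that amounts to, as a standalone sketch:

#include <stdint.h>

/* WRMSR takes the 64-bit value as EDX (high half) : EAX (low half) */
static uint64_t read_edx_eax(uint32_t eax, uint32_t edx)
{
	return ((uint64_t)edx << 32) | eax;
}
/* read_edx_eax(0x89abcdef, 0x01234567) == 0x0123456789abcdefULL */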
@@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3325 [SVM_EXIT_READ_CR3] = cr_interception, 3332 [SVM_EXIT_READ_CR3] = cr_interception,
3326 [SVM_EXIT_READ_CR4] = cr_interception, 3333 [SVM_EXIT_READ_CR4] = cr_interception,
3327 [SVM_EXIT_READ_CR8] = cr_interception, 3334 [SVM_EXIT_READ_CR8] = cr_interception,
3328 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3335 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
3329 [SVM_EXIT_WRITE_CR0] = cr_interception, 3336 [SVM_EXIT_WRITE_CR0] = cr_interception,
3330 [SVM_EXIT_WRITE_CR3] = cr_interception, 3337 [SVM_EXIT_WRITE_CR3] = cr_interception,
3331 [SVM_EXIT_WRITE_CR4] = cr_interception, 3338 [SVM_EXIT_WRITE_CR4] = cr_interception,
@@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3376 [SVM_EXIT_STGI] = stgi_interception, 3383 [SVM_EXIT_STGI] = stgi_interception,
3377 [SVM_EXIT_CLGI] = clgi_interception, 3384 [SVM_EXIT_CLGI] = clgi_interception,
3378 [SVM_EXIT_SKINIT] = skinit_interception, 3385 [SVM_EXIT_SKINIT] = skinit_interception,
3379 [SVM_EXIT_WBINVD] = emulate_on_interception, 3386 [SVM_EXIT_WBINVD] = wbinvd_interception,
3380 [SVM_EXIT_MONITOR] = monitor_interception, 3387 [SVM_EXIT_MONITOR] = monitor_interception,
3381 [SVM_EXIT_MWAIT] = mwait_interception, 3388 [SVM_EXIT_MWAIT] = mwait_interception,
3382 [SVM_EXIT_XSETBV] = xsetbv_interception, 3389 [SVM_EXIT_XSETBV] = xsetbv_interception,
@@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3555 3562
3556 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 3563 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3557 || !svm_exit_handlers[exit_code]) { 3564 || !svm_exit_handlers[exit_code]) {
3558 WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); 3565 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
3559 kvm_queue_exception(vcpu, UD_VECTOR); 3566 kvm_queue_exception(vcpu, UD_VECTOR);
3560 return 1; 3567 return 1;
3561 } 3568 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae4f6d35d19c..f5e8dce8046c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2470,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2470 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2470 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2471 vmx->nested.nested_vmx_secondary_ctls_high &= 2471 vmx->nested.nested_vmx_secondary_ctls_high &=
2472 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2472 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2473 SECONDARY_EXEC_RDTSCP |
2473 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2474 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2474 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2475 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2476 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -3268,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3268 * default value. 3269 * default value.
3269 */ 3270 */
3270 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3271 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3271 save->selector &= ~SELECTOR_RPL_MASK; 3272 save->selector &= ~SEGMENT_RPL_MASK;
3272 save->dpl = save->selector & SELECTOR_RPL_MASK; 3273 save->dpl = save->selector & SEGMENT_RPL_MASK;
3273 save->s = 1; 3274 save->s = 1;
3274 } 3275 }
3275 vmx_set_segment(vcpu, save, seg); 3276 vmx_set_segment(vcpu, save, seg);
@@ -3842,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
3842 unsigned int cs_rpl; 3843 unsigned int cs_rpl;
3843 3844
3844 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3845 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3845 cs_rpl = cs.selector & SELECTOR_RPL_MASK; 3846 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3846 3847
3847 if (cs.unusable) 3848 if (cs.unusable)
3848 return false; 3849 return false;
@@ -3870,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3870 unsigned int ss_rpl; 3871 unsigned int ss_rpl;
3871 3872
3872 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3873 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3873 ss_rpl = ss.selector & SELECTOR_RPL_MASK; 3874 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3874 3875
3875 if (ss.unusable) 3876 if (ss.unusable)
3876 return true; 3877 return true;
@@ -3892,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3892 unsigned int rpl; 3893 unsigned int rpl;
3893 3894
3894 vmx_get_segment(vcpu, &var, seg); 3895 vmx_get_segment(vcpu, &var, seg);
3895 rpl = var.selector & SELECTOR_RPL_MASK; 3896 rpl = var.selector & SEGMENT_RPL_MASK;
3896 3897
3897 if (var.unusable) 3898 if (var.unusable)
3898 return true; 3899 return true;
@@ -3919,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
3919 3920
3920 if (tr.unusable) 3921 if (tr.unusable)
3921 return false; 3922 return false;
3922 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3923 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3923 return false; 3924 return false;
3924 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3925 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3925 return false; 3926 return false;
@@ -3937,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
3937 3938
3938 if (ldtr.unusable) 3939 if (ldtr.unusable)
3939 return true; 3940 return true;
3940 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3941 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3941 return false; 3942 return false;
3942 if (ldtr.type != 2) 3943 if (ldtr.type != 2)
3943 return false; 3944 return false;
@@ -3954,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3954 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3955 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3955 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3956 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3956 3957
3957 return ((cs.selector & SELECTOR_RPL_MASK) == 3958 return ((cs.selector & SEGMENT_RPL_MASK) ==
3958 (ss.selector & SELECTOR_RPL_MASK)); 3959 (ss.selector & SEGMENT_RPL_MASK));
3959} 3960}
3960 3961
3961/* 3962/*
@@ -4711,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4711 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4712 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4712 kvm_set_cr8(&vmx->vcpu, 0); 4713 kvm_set_cr8(&vmx->vcpu, 0);
4713 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; 4714 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
4714 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4715 if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
4715 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4716 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4716 apic_base_msr.host_initiated = true; 4717 apic_base_msr.host_initiated = true;
4717 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); 4718 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
@@ -5006,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5006 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 5007 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
5007 if (vcpu->arch.halt_request) { 5008 if (vcpu->arch.halt_request) {
5008 vcpu->arch.halt_request = 0; 5009 vcpu->arch.halt_request = 0;
5009 return kvm_emulate_halt(vcpu); 5010 return kvm_vcpu_halt(vcpu);
5010 } 5011 }
5011 return 1; 5012 return 1;
5012 } 5013 }
@@ -5071,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5071 } 5072 }
5072 5073
5073 if (is_invalid_opcode(intr_info)) { 5074 if (is_invalid_opcode(intr_info)) {
5075 if (is_guest_mode(vcpu)) {
5076 kvm_queue_exception(vcpu, UD_VECTOR);
5077 return 1;
5078 }
5074 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 5079 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5075 if (er != EMULATE_DONE) 5080 if (er != EMULATE_DONE)
5076 kvm_queue_exception(vcpu, UD_VECTOR); 5081 kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5090,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5090 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5095 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5091 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5096 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5092 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5097 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5093 vcpu->run->internal.ndata = 2; 5098 vcpu->run->internal.ndata = 3;
5094 vcpu->run->internal.data[0] = vect_info; 5099 vcpu->run->internal.data[0] = vect_info;
5095 vcpu->run->internal.data[1] = intr_info; 5100 vcpu->run->internal.data[1] = intr_info;
5101 vcpu->run->internal.data[2] = error_code;
5096 return 0; 5102 return 0;
5097 } 5103 }
5098 5104
@@ -5533,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5533 5539
5534static int handle_halt(struct kvm_vcpu *vcpu) 5540static int handle_halt(struct kvm_vcpu *vcpu)
5535{ 5541{
5536 skip_emulated_instruction(vcpu);
5537 return kvm_emulate_halt(vcpu); 5542 return kvm_emulate_halt(vcpu);
5538} 5543}
5539 5544
5540static int handle_vmcall(struct kvm_vcpu *vcpu) 5545static int handle_vmcall(struct kvm_vcpu *vcpu)
5541{ 5546{
5542 skip_emulated_instruction(vcpu);
5543 kvm_emulate_hypercall(vcpu); 5547 kvm_emulate_hypercall(vcpu);
5544 return 1; 5548 return 1;
5545} 5549}
@@ -5570,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
5570 5574
5571static int handle_wbinvd(struct kvm_vcpu *vcpu) 5575static int handle_wbinvd(struct kvm_vcpu *vcpu)
5572{ 5576{
5573 skip_emulated_instruction(vcpu);
5574 kvm_emulate_wbinvd(vcpu); 5577 kvm_emulate_wbinvd(vcpu);
5575 return 1; 5578 return 1;
5576} 5579}
@@ -5828,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5828 gpa_t gpa; 5831 gpa_t gpa;
5829 5832
5830 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5833 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5831 if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5834 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5832 skip_emulated_instruction(vcpu); 5835 skip_emulated_instruction(vcpu);
5833 return 1; 5836 return 1;
5834 } 5837 }
@@ -5909,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5909 5912
5910 if (vcpu->arch.halt_request) { 5913 if (vcpu->arch.halt_request) {
5911 vcpu->arch.halt_request = 0; 5914 vcpu->arch.halt_request = 0;
5912 ret = kvm_emulate_halt(vcpu); 5915 ret = kvm_vcpu_halt(vcpu);
5913 goto out; 5916 goto out;
5914 } 5917 }
5915 5918
@@ -7318,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
7318 else if (port < 0x10000) 7321 else if (port < 0x10000)
7319 bitmap = vmcs12->io_bitmap_b; 7322 bitmap = vmcs12->io_bitmap_b;
7320 else 7323 else
7321 return 1; 7324 return true;
7322 bitmap += (port & 0x7fff) / 8; 7325 bitmap += (port & 0x7fff) / 8;
7323 7326
7324 if (last_bitmap != bitmap) 7327 if (last_bitmap != bitmap)
7325 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 7328 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
7326 return 1; 7329 return true;
7327 if (b & (1 << (port & 7))) 7330 if (b & (1 << (port & 7)))
7328 return 1; 7331 return true;
7329 7332
7330 port++; 7333 port++;
7331 size--; 7334 size--;
7332 last_bitmap = bitmap; 7335 last_bitmap = bitmap;
7333 } 7336 }
7334 7337
7335 return 0; 7338 return false;
7336} 7339}
7337 7340
7338/* 7341/*
@@ -7348,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7348 gpa_t bitmap; 7351 gpa_t bitmap;
7349 7352
7350 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 7353 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
7351 return 1; 7354 return true;
7352 7355
7353 /* 7356 /*
7354 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 7357 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
@@ -7367,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7367 if (msr_index < 1024*8) { 7370 if (msr_index < 1024*8) {
7368 unsigned char b; 7371 unsigned char b;
7369 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 7372 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
7370 return 1; 7373 return true;
7371 return 1 & (b >> (msr_index & 7)); 7374 return 1 & (b >> (msr_index & 7));
7372 } else 7375 } else
7373 return 1; /* let L1 handle the wrong parameter */ 7376 return true; /* let L1 handle the wrong parameter */
7374} 7377}
7375 7378
7376/* 7379/*
@@ -7392,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7392 case 0: 7395 case 0:
7393 if (vmcs12->cr0_guest_host_mask & 7396 if (vmcs12->cr0_guest_host_mask &
7394 (val ^ vmcs12->cr0_read_shadow)) 7397 (val ^ vmcs12->cr0_read_shadow))
7395 return 1; 7398 return true;
7396 break; 7399 break;
7397 case 3: 7400 case 3:
7398 if ((vmcs12->cr3_target_count >= 1 && 7401 if ((vmcs12->cr3_target_count >= 1 &&
@@ -7403,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7403 vmcs12->cr3_target_value2 == val) || 7406 vmcs12->cr3_target_value2 == val) ||
7404 (vmcs12->cr3_target_count >= 4 && 7407 (vmcs12->cr3_target_count >= 4 &&
7405 vmcs12->cr3_target_value3 == val)) 7408 vmcs12->cr3_target_value3 == val))
7406 return 0; 7409 return false;
7407 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 7410 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
7408 return 1; 7411 return true;
7409 break; 7412 break;
7410 case 4: 7413 case 4:
7411 if (vmcs12->cr4_guest_host_mask & 7414 if (vmcs12->cr4_guest_host_mask &
7412 (vmcs12->cr4_read_shadow ^ val)) 7415 (vmcs12->cr4_read_shadow ^ val))
7413 return 1; 7416 return true;
7414 break; 7417 break;
7415 case 8: 7418 case 8:
7416 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 7419 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
7417 return 1; 7420 return true;
7418 break; 7421 break;
7419 } 7422 }
7420 break; 7423 break;
7421 case 2: /* clts */ 7424 case 2: /* clts */
7422 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 7425 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
7423 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 7426 (vmcs12->cr0_read_shadow & X86_CR0_TS))
7424 return 1; 7427 return true;
7425 break; 7428 break;
7426 case 1: /* mov from cr */ 7429 case 1: /* mov from cr */
7427 switch (cr) { 7430 switch (cr) {
7428 case 3: 7431 case 3:
7429 if (vmcs12->cpu_based_vm_exec_control & 7432 if (vmcs12->cpu_based_vm_exec_control &
7430 CPU_BASED_CR3_STORE_EXITING) 7433 CPU_BASED_CR3_STORE_EXITING)
7431 return 1; 7434 return true;
7432 break; 7435 break;
7433 case 8: 7436 case 8:
7434 if (vmcs12->cpu_based_vm_exec_control & 7437 if (vmcs12->cpu_based_vm_exec_control &
7435 CPU_BASED_CR8_STORE_EXITING) 7438 CPU_BASED_CR8_STORE_EXITING)
7436 return 1; 7439 return true;
7437 break; 7440 break;
7438 } 7441 }
7439 break; 7442 break;
@@ -7444,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7444 */ 7447 */
7445 if (vmcs12->cr0_guest_host_mask & 0xe & 7448 if (vmcs12->cr0_guest_host_mask & 0xe &
7446 (val ^ vmcs12->cr0_read_shadow)) 7449 (val ^ vmcs12->cr0_read_shadow))
7447 return 1; 7450 return true;
7448 if ((vmcs12->cr0_guest_host_mask & 0x1) && 7451 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
7449 !(vmcs12->cr0_read_shadow & 0x1) && 7452 !(vmcs12->cr0_read_shadow & 0x1) &&
7450 (val & 0x1)) 7453 (val & 0x1))
7451 return 1; 7454 return true;
7452 break; 7455 break;
7453 } 7456 }
7454 return 0; 7457 return false;
7455} 7458}
7456 7459
7457/* 7460/*
@@ -7474,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7474 KVM_ISA_VMX); 7477 KVM_ISA_VMX);
7475 7478
7476 if (vmx->nested.nested_run_pending) 7479 if (vmx->nested.nested_run_pending)
7477 return 0; 7480 return false;
7478 7481
7479 if (unlikely(vmx->fail)) { 7482 if (unlikely(vmx->fail)) {
7480 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 7483 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
7481 vmcs_read32(VM_INSTRUCTION_ERROR)); 7484 vmcs_read32(VM_INSTRUCTION_ERROR));
7482 return 1; 7485 return true;
7483 } 7486 }
7484 7487
7485 switch (exit_reason) { 7488 switch (exit_reason) {
7486 case EXIT_REASON_EXCEPTION_NMI: 7489 case EXIT_REASON_EXCEPTION_NMI:
7487 if (!is_exception(intr_info)) 7490 if (!is_exception(intr_info))
7488 return 0; 7491 return false;
7489 else if (is_page_fault(intr_info)) 7492 else if (is_page_fault(intr_info))
7490 return enable_ept; 7493 return enable_ept;
7491 else if (is_no_device(intr_info) && 7494 else if (is_no_device(intr_info) &&
7492 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7495 !(vmcs12->guest_cr0 & X86_CR0_TS))
7493 return 0; 7496 return false;
7494 return vmcs12->exception_bitmap & 7497 return vmcs12->exception_bitmap &
7495 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7498 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
7496 case EXIT_REASON_EXTERNAL_INTERRUPT: 7499 case EXIT_REASON_EXTERNAL_INTERRUPT:
7497 return 0; 7500 return false;
7498 case EXIT_REASON_TRIPLE_FAULT: 7501 case EXIT_REASON_TRIPLE_FAULT:
7499 return 1; 7502 return true;
7500 case EXIT_REASON_PENDING_INTERRUPT: 7503 case EXIT_REASON_PENDING_INTERRUPT:
7501 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7504 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
7502 case EXIT_REASON_NMI_WINDOW: 7505 case EXIT_REASON_NMI_WINDOW:
7503 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7506 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
7504 case EXIT_REASON_TASK_SWITCH: 7507 case EXIT_REASON_TASK_SWITCH:
7505 return 1; 7508 return true;
7506 case EXIT_REASON_CPUID: 7509 case EXIT_REASON_CPUID:
7507 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7510 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
7508 return 0; 7511 return false;
7509 return 1; 7512 return true;
7510 case EXIT_REASON_HLT: 7513 case EXIT_REASON_HLT:
7511 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7514 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
7512 case EXIT_REASON_INVD: 7515 case EXIT_REASON_INVD:
7513 return 1; 7516 return true;
7514 case EXIT_REASON_INVLPG: 7517 case EXIT_REASON_INVLPG:
7515 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7518 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
7516 case EXIT_REASON_RDPMC: 7519 case EXIT_REASON_RDPMC:
7517 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7520 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
7518 case EXIT_REASON_RDTSC: 7521 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
7519 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7522 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
7520 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7523 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
7521 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7524 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
@@ -7527,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7527 * VMX instructions trap unconditionally. This allows L1 to 7530 * VMX instructions trap unconditionally. This allows L1 to
7528 * emulate them for its L2 guest, i.e., allows 3-level nesting! 7531 * emulate them for its L2 guest, i.e., allows 3-level nesting!
7529 */ 7532 */
7530 return 1; 7533 return true;
7531 case EXIT_REASON_CR_ACCESS: 7534 case EXIT_REASON_CR_ACCESS:
7532 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 7535 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
7533 case EXIT_REASON_DR_ACCESS: 7536 case EXIT_REASON_DR_ACCESS:
@@ -7538,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7538 case EXIT_REASON_MSR_WRITE: 7541 case EXIT_REASON_MSR_WRITE:
7539 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 7542 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
7540 case EXIT_REASON_INVALID_STATE: 7543 case EXIT_REASON_INVALID_STATE:
7541 return 1; 7544 return true;
7542 case EXIT_REASON_MWAIT_INSTRUCTION: 7545 case EXIT_REASON_MWAIT_INSTRUCTION:
7543 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 7546 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
7544 case EXIT_REASON_MONITOR_INSTRUCTION: 7547 case EXIT_REASON_MONITOR_INSTRUCTION:
@@ -7548,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7548 nested_cpu_has2(vmcs12, 7551 nested_cpu_has2(vmcs12,
7549 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 7552 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
7550 case EXIT_REASON_MCE_DURING_VMENTRY: 7553 case EXIT_REASON_MCE_DURING_VMENTRY:
7551 return 0; 7554 return false;
7552 case EXIT_REASON_TPR_BELOW_THRESHOLD: 7555 case EXIT_REASON_TPR_BELOW_THRESHOLD:
7553 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 7556 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
7554 case EXIT_REASON_APIC_ACCESS: 7557 case EXIT_REASON_APIC_ACCESS:
@@ -7557,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7557 case EXIT_REASON_APIC_WRITE: 7560 case EXIT_REASON_APIC_WRITE:
7558 case EXIT_REASON_EOI_INDUCED: 7561 case EXIT_REASON_EOI_INDUCED:
7559 /* apic_write and eoi_induced should exit unconditionally. */ 7562 /* apic_write and eoi_induced should exit unconditionally. */
7560 return 1; 7563 return true;
7561 case EXIT_REASON_EPT_VIOLATION: 7564 case EXIT_REASON_EPT_VIOLATION:
7562 /* 7565 /*
7563 * L0 always deals with the EPT violation. If nested EPT is 7566 * L0 always deals with the EPT violation. If nested EPT is
@@ -7565,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7565 * missing in the guest EPT table (EPT12), the EPT violation 7568 * missing in the guest EPT table (EPT12), the EPT violation
7566 * will be injected with nested_ept_inject_page_fault() 7569 * will be injected with nested_ept_inject_page_fault()
7567 */ 7570 */
7568 return 0; 7571 return false;
7569 case EXIT_REASON_EPT_MISCONFIG: 7572 case EXIT_REASON_EPT_MISCONFIG:
7570 /* 7573 /*
7571 * L2 never uses directly L1's EPT, but rather L0's own EPT 7574 * L2 never uses directly L1's EPT, but rather L0's own EPT
@@ -7573,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7573 * (EPT on EPT). So any problems with the structure of the 7576 * (EPT on EPT). So any problems with the structure of the
7574 * table is L0's fault. 7577 * table is L0's fault.
7575 */ 7578 */
7576 return 0; 7579 return false;
7577 case EXIT_REASON_WBINVD: 7580 case EXIT_REASON_WBINVD:
7578 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 7581 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
7579 case EXIT_REASON_XSETBV: 7582 case EXIT_REASON_XSETBV:
7580 return 1; 7583 return true;
7581 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 7584 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
7582 /* 7585 /*
7583 * This should never happen, since it is not possible to 7586 * This should never happen, since it is not possible to
@@ -7587,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7587 */ 7590 */
7588 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7591 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7589 default: 7592 default:
7590 return 1; 7593 return true;
7591 } 7594 }
7592} 7595}
7593 7596
@@ -8522,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8522 exec_control); 8525 exec_control);
8523 } 8526 }
8524 } 8527 }
8528 if (nested && !vmx->rdtscp_enabled)
8529 vmx->nested.nested_vmx_secondary_ctls_high &=
8530 ~SECONDARY_EXEC_RDTSCP;
8525 } 8531 }
8526 8532
8527 /* Exposing INVPCID only when PCID is exposed */ 8533 /* Exposing INVPCID only when PCID is exposed */
@@ -8622,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8622 struct vmcs12 *vmcs12) 8628 struct vmcs12 *vmcs12)
8623{ 8629{
8624 struct vcpu_vmx *vmx = to_vmx(vcpu); 8630 struct vcpu_vmx *vmx = to_vmx(vcpu);
8631 int maxphyaddr = cpuid_maxphyaddr(vcpu);
8625 8632
8626 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 8633 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
8627 /* TODO: Also verify bits beyond physical address width are 0 */ 8634 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
8628 if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) 8635 vmcs12->apic_access_addr >> maxphyaddr)
8629 return false; 8636 return false;
8630 8637
8631 /* 8638 /*
@@ -8641,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8641 } 8648 }
8642 8649
8643 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 8650 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
8644 /* TODO: Also verify bits beyond physical address width are 0 */ 8651 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
8645 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) 8652 vmcs12->virtual_apic_page_addr >> maxphyaddr)
8646 return false; 8653 return false;
8647 8654
8648 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 8655 if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8665,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8665 } 8672 }
8666 8673
8667 if (nested_cpu_has_posted_intr(vmcs12)) { 8674 if (nested_cpu_has_posted_intr(vmcs12)) {
8668 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) 8675 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
8676 vmcs12->posted_intr_desc_addr >> maxphyaddr)
8669 return false; 8677 return false;
8670 8678
8671 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 8679 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -8864,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8864 8872
8865static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 8873static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8866 unsigned long count_field, 8874 unsigned long count_field,
8867 unsigned long addr_field, 8875 unsigned long addr_field)
8868 int maxphyaddr)
8869{ 8876{
8877 int maxphyaddr;
8870 u64 count, addr; 8878 u64 count, addr;
8871 8879
8872 if (vmcs12_read_any(vcpu, count_field, &count) || 8880 if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8876,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8876 } 8884 }
8877 if (count == 0) 8885 if (count == 0)
8878 return 0; 8886 return 0;
8887 maxphyaddr = cpuid_maxphyaddr(vcpu);
8879 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 8888 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8880 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { 8889 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8881 pr_warn_ratelimited( 8890 pr_warn_ratelimited(
@@ -8889,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8889static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, 8898static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8890 struct vmcs12 *vmcs12) 8899 struct vmcs12 *vmcs12)
8891{ 8900{
8892 int maxphyaddr;
8893
8894 if (vmcs12->vm_exit_msr_load_count == 0 && 8901 if (vmcs12->vm_exit_msr_load_count == 0 &&
8895 vmcs12->vm_exit_msr_store_count == 0 && 8902 vmcs12->vm_exit_msr_store_count == 0 &&
8896 vmcs12->vm_entry_msr_load_count == 0) 8903 vmcs12->vm_entry_msr_load_count == 0)
8897 return 0; /* Fast path */ 8904 return 0; /* Fast path */
8898 maxphyaddr = cpuid_maxphyaddr(vcpu);
8899 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, 8905 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8900 VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || 8906 VM_EXIT_MSR_LOAD_ADDR) ||
8901 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, 8907 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8902 VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || 8908 VM_EXIT_MSR_STORE_ADDR) ||
8903 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 8909 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8904 VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) 8910 VM_ENTRY_MSR_LOAD_ADDR))
8905 return -EINVAL; 8911 return -EINVAL;
8906 return 0; 8912 return 0;
8907} 8913}
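nested_vmx_check_msr_switch() now looks up MAXPHYADDR itself and rejects any MSR-switch area whose first or last byte has bits set at or above it; each entry is 16 bytes (the vmx_msr_entry layout) and the base must be 16-byte aligned. A minimal version of that check, assuming those sizes:

#include <stdint.h>

static int msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
{
	const uint64_t entry_size = 16;   /* sizeof(struct vmx_msr_entry), assumed */
	uint64_t last;

	if (count == 0)
		return 1;                 /* fast path: nothing to validate */
	if (addr & 0xf)
		return 0;                 /* base must be 16-byte aligned   */

	last = addr + count * entry_size - 1;
	return !(addr >> maxphyaddr) && !(last >> maxphyaddr);
}
/* e.g. with maxphyaddr == 36, addr = 1ULL << 36 is rejected */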
@@ -9151,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9151 exec_control &= ~SECONDARY_EXEC_RDTSCP; 9157 exec_control &= ~SECONDARY_EXEC_RDTSCP;
9152 /* Take the following fields only from vmcs12 */ 9158 /* Take the following fields only from vmcs12 */
9153 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9159 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9160 SECONDARY_EXEC_RDTSCP |
9154 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9161 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9155 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9162 SECONDARY_EXEC_APIC_REGISTER_VIRT);
9156 if (nested_cpu_has(vmcs12, 9163 if (nested_cpu_has(vmcs12,
9157 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9164 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9158 exec_control |= vmcs12->secondary_vm_exec_control; 9165 exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9385,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9385 } 9392 }
9386 9393
9387 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9394 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
9388 /*TODO: Also verify bits beyond physical address width are 0*/
9389 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9395 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9390 return 1; 9396 return 1;
9391 } 9397 }
@@ -9524,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9524 vmcs12->launch_state = 1; 9530 vmcs12->launch_state = 1;
9525 9531
9526 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9532 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
9527 return kvm_emulate_halt(vcpu); 9533 return kvm_vcpu_halt(vcpu);
9528 9534
9529 vmx->nested.nested_run_pending = 1; 9535 vmx->nested.nested_run_pending = 1;
9530 9536
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32bf19ef3115..e1a81267f3f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
801} 801}
802EXPORT_SYMBOL_GPL(kvm_get_cr8); 802EXPORT_SYMBOL_GPL(kvm_get_cr8);
803 803
804static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
805{
806 int i;
807
808 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
809 for (i = 0; i < KVM_NR_DB_REGS; i++)
810 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
811 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
812 }
813}
814
804static void kvm_update_dr6(struct kvm_vcpu *vcpu) 815static void kvm_update_dr6(struct kvm_vcpu *vcpu)
805{ 816{
806 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 817 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
@@ -1070,19 +1081,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
1070 struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 1081 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1071 u64 boot_ns; 1082 u64 boot_ns;
1072 1083
1073 boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); 1084 boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1074 1085
1075 write_seqcount_begin(&vdata->seq); 1086 write_seqcount_begin(&vdata->seq);
1076 1087
1077 /* copy pvclock gtod data */ 1088 /* copy pvclock gtod data */
1078 vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; 1089 vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1079 vdata->clock.cycle_last = tk->tkr.cycle_last; 1090 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
1080 vdata->clock.mask = tk->tkr.mask; 1091 vdata->clock.mask = tk->tkr_mono.mask;
1081 vdata->clock.mult = tk->tkr.mult; 1092 vdata->clock.mult = tk->tkr_mono.mult;
1082 vdata->clock.shift = tk->tkr.shift; 1093 vdata->clock.shift = tk->tkr_mono.shift;
1083 1094
1084 vdata->boot_ns = boot_ns; 1095 vdata->boot_ns = boot_ns;
1085 vdata->nsec_base = tk->tkr.xtime_nsec; 1096 vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1086 1097
1087 write_seqcount_end(&vdata->seq); 1098 write_seqcount_end(&vdata->seq);
1088} 1099}
@@ -3149,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3149 return -EINVAL; 3160 return -EINVAL;
3150 3161
3151 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 3162 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3163 kvm_update_dr0123(vcpu);
3152 vcpu->arch.dr6 = dbgregs->dr6; 3164 vcpu->arch.dr6 = dbgregs->dr6;
3153 kvm_update_dr6(vcpu); 3165 kvm_update_dr6(vcpu);
3154 vcpu->arch.dr7 = dbgregs->dr7; 3166 vcpu->arch.dr7 = dbgregs->dr7;
@@ -4114,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
4114 do { 4126 do {
4115 n = min(len, 8); 4127 n = min(len, 8);
4116 if (!(vcpu->arch.apic && 4128 if (!(vcpu->arch.apic &&
4117 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) 4129 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
4118 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 4130 && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
4119 break; 4131 break;
4120 handled += n; 4132 handled += n;
4121 addr += n; 4133 addr += n;
@@ -4134,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
4134 do { 4146 do {
4135 n = min(len, 8); 4147 n = min(len, 8);
4136 if (!(vcpu->arch.apic && 4148 if (!(vcpu->arch.apic &&
4137 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) 4149 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
4138 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 4150 addr, n, v))
4151 && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
4139 break; 4152 break;
4140 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); 4153 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
4141 handled += n; 4154 handled += n;
@@ -4475,7 +4488,8 @@ mmio:
4475 return X86EMUL_CONTINUE; 4488 return X86EMUL_CONTINUE;
4476} 4489}
4477 4490
4478int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 4491static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
4492 unsigned long addr,
4479 void *val, unsigned int bytes, 4493 void *val, unsigned int bytes,
4480 struct x86_exception *exception, 4494 struct x86_exception *exception,
4481 const struct read_write_emulator_ops *ops) 4495 const struct read_write_emulator_ops *ops)
@@ -4538,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4538 exception, &read_emultor); 4552 exception, &read_emultor);
4539} 4553}
4540 4554
4541int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 4555static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4542 unsigned long addr, 4556 unsigned long addr,
4543 const void *val, 4557 const void *val,
4544 unsigned int bytes, 4558 unsigned int bytes,
@@ -4629,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4629 int r; 4643 int r;
4630 4644
4631 if (vcpu->arch.pio.in) 4645 if (vcpu->arch.pio.in)
4632 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 4646 r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
4633 vcpu->arch.pio.size, pd); 4647 vcpu->arch.pio.size, pd);
4634 else 4648 else
4635 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 4649 r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
4636 vcpu->arch.pio.port, vcpu->arch.pio.size, 4650 vcpu->arch.pio.port, vcpu->arch.pio.size,
4637 pd); 4651 pd);
4638 return r; 4652 return r;
@@ -4705,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4705 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); 4719 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4706} 4720}
4707 4721
4708int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4722int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
4709{ 4723{
4710 if (!need_emulate_wbinvd(vcpu)) 4724 if (!need_emulate_wbinvd(vcpu))
4711 return X86EMUL_CONTINUE; 4725 return X86EMUL_CONTINUE;
@@ -4722,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4722 wbinvd(); 4736 wbinvd();
4723 return X86EMUL_CONTINUE; 4737 return X86EMUL_CONTINUE;
4724} 4738}
4739
4740int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4741{
4742 kvm_x86_ops->skip_emulated_instruction(vcpu);
4743 return kvm_emulate_wbinvd_noskip(vcpu);
4744}
4725EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4745EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4726 4746
4747
4748
4727static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) 4749static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4728{ 4750{
4729 kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); 4751 kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
4730} 4752}
4731 4753
4732int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 4754static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
4755 unsigned long *dest)
4733{ 4756{
4734 return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); 4757 return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4735} 4758}
4736 4759
4737int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 4760static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
4761 unsigned long value)
4738{ 4762{
4739 4763
4740 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); 4764 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
@@ -5816,7 +5840,7 @@ void kvm_arch_exit(void)
5816 free_percpu(shared_msrs); 5840 free_percpu(shared_msrs);
5817} 5841}
5818 5842
5819int kvm_emulate_halt(struct kvm_vcpu *vcpu) 5843int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
5820{ 5844{
5821 ++vcpu->stat.halt_exits; 5845 ++vcpu->stat.halt_exits;
5822 if (irqchip_in_kernel(vcpu->kvm)) { 5846 if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5827,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5827 return 0; 5851 return 0;
5828 } 5852 }
5829} 5853}
5854EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
5855
5856int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5857{
5858 kvm_x86_ops->skip_emulated_instruction(vcpu);
5859 return kvm_vcpu_halt(vcpu);
5860}
5830EXPORT_SYMBOL_GPL(kvm_emulate_halt); 5861EXPORT_SYMBOL_GPL(kvm_emulate_halt);
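kvm_vcpu_halt() is now the core that only changes vCPU state, while kvm_emulate_halt() keeps the old contract of skipping the instruction first; SVM's halt_interception(), which sets next_rip itself, and the paths that re-enter HLT from emulation call the non-skipping core so RIP is never advanced twice. The wrapper shape, with illustrative names and stand-in types:

struct vcpu_sketch { unsigned long rip; int halted; int kernel_irqchip; };

static void skip_instruction(struct vcpu_sketch *v) { v->rip += 1; /* stand-in */ }

/* core: state change only; 1 = stay in kernel, 0 = exit to userspace */
static int vcpu_halt_noskip(struct vcpu_sketch *v)
{
	v->halted = 1;
	return v->kernel_irqchip ? 1 : 0;
}

/* legacy entry point: advance RIP exactly once, then do the real work */
static int vcpu_halt(struct vcpu_sketch *v)
{
	skip_instruction(v);
	return vcpu_halt_noskip(v);
}

Callers that have already queued their own instruction skip use the _noskip flavour directly, which is exactly what the hypercall and WBINVD paths in this patch do as well.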
5831 5862
5832int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 5863int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5903,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5903 lapic_irq.dest_id = apicid; 5934 lapic_irq.dest_id = apicid;
5904 5935
5905 lapic_irq.delivery_mode = APIC_DM_REMRD; 5936 lapic_irq.delivery_mode = APIC_DM_REMRD;
5906 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); 5937 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
5907} 5938}
5908 5939
5909int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5940int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -5911,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5911 unsigned long nr, a0, a1, a2, a3, ret; 5942 unsigned long nr, a0, a1, a2, a3, ret;
5912 int op_64_bit, r = 1; 5943 int op_64_bit, r = 1;
5913 5944
5945 kvm_x86_ops->skip_emulated_instruction(vcpu);
5946
5914 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 5947 if (kvm_hv_hypercall_enabled(vcpu->kvm))
5915 return kvm_hv_hypercall(vcpu); 5948 return kvm_hv_hypercall(vcpu);
5916 5949
@@ -6164,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
6164} 6197}
6165 6198
6166/* 6199/*
6167 * Returns 1 to let __vcpu_run() continue the guest execution loop without 6200 * Returns 1 to let vcpu_run() continue the guest execution loop without
6168 * exiting to the userspace. Otherwise, the value will be returned to the 6201 * exiting to the userspace. Otherwise, the value will be returned to the
6169 * userspace. 6202 * userspace.
6170 */ 6203 */
@@ -6301,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6301 set_debugreg(vcpu->arch.eff_db[2], 2); 6334 set_debugreg(vcpu->arch.eff_db[2], 2);
6302 set_debugreg(vcpu->arch.eff_db[3], 3); 6335 set_debugreg(vcpu->arch.eff_db[3], 3);
6303 set_debugreg(vcpu->arch.dr6, 6); 6336 set_debugreg(vcpu->arch.dr6, 6);
6337 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
6304 } 6338 }
6305 6339
6306 trace_kvm_entry(vcpu->vcpu_id); 6340 trace_kvm_entry(vcpu->vcpu_id);
@@ -6382,42 +6416,47 @@ out:
6382 return r; 6416 return r;
6383} 6417}
6384 6418
6419static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
6420{
6421 if (!kvm_arch_vcpu_runnable(vcpu)) {
6422 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6423 kvm_vcpu_block(vcpu);
6424 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6425 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
6426 return 1;
6427 }
6428
6429 kvm_apic_accept_events(vcpu);
6430 switch(vcpu->arch.mp_state) {
6431 case KVM_MP_STATE_HALTED:
6432 vcpu->arch.pv.pv_unhalted = false;
6433 vcpu->arch.mp_state =
6434 KVM_MP_STATE_RUNNABLE;
6435 case KVM_MP_STATE_RUNNABLE:
6436 vcpu->arch.apf.halted = false;
6437 break;
6438 case KVM_MP_STATE_INIT_RECEIVED:
6439 break;
6440 default:
6441 return -EINTR;
6442 break;
6443 }
6444 return 1;
6445}
6385 6446
6386static int __vcpu_run(struct kvm_vcpu *vcpu) 6447static int vcpu_run(struct kvm_vcpu *vcpu)
6387{ 6448{
6388 int r; 6449 int r;
6389 struct kvm *kvm = vcpu->kvm; 6450 struct kvm *kvm = vcpu->kvm;
6390 6451
6391 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6452 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6392 6453
6393 r = 1; 6454 for (;;) {
6394 while (r > 0) {
6395 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6455 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6396 !vcpu->arch.apf.halted) 6456 !vcpu->arch.apf.halted)
6397 r = vcpu_enter_guest(vcpu); 6457 r = vcpu_enter_guest(vcpu);
6398 else { 6458 else
6399 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6459 r = vcpu_block(kvm, vcpu);
6400 kvm_vcpu_block(vcpu);
6401 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6402 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
6403 kvm_apic_accept_events(vcpu);
6404 switch(vcpu->arch.mp_state) {
6405 case KVM_MP_STATE_HALTED:
6406 vcpu->arch.pv.pv_unhalted = false;
6407 vcpu->arch.mp_state =
6408 KVM_MP_STATE_RUNNABLE;
6409 case KVM_MP_STATE_RUNNABLE:
6410 vcpu->arch.apf.halted = false;
6411 break;
6412 case KVM_MP_STATE_INIT_RECEIVED:
6413 break;
6414 default:
6415 r = -EINTR;
6416 break;
6417 }
6418 }
6419 }
6420
6421 if (r <= 0) 6460 if (r <= 0)
6422 break; 6461 break;
6423 6462
@@ -6429,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6429 r = -EINTR; 6468 r = -EINTR;
6430 vcpu->run->exit_reason = KVM_EXIT_INTR; 6469 vcpu->run->exit_reason = KVM_EXIT_INTR;
6431 ++vcpu->stat.request_irq_exits; 6470 ++vcpu->stat.request_irq_exits;
6471 break;
6432 } 6472 }
6433 6473
6434 kvm_check_async_pf_completion(vcpu); 6474 kvm_check_async_pf_completion(vcpu);
@@ -6437,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6437 r = -EINTR; 6477 r = -EINTR;
6438 vcpu->run->exit_reason = KVM_EXIT_INTR; 6478 vcpu->run->exit_reason = KVM_EXIT_INTR;
6439 ++vcpu->stat.signal_exits; 6479 ++vcpu->stat.signal_exits;
6480 break;
6440 } 6481 }
6441 if (need_resched()) { 6482 if (need_resched()) {
6442 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6483 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
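
The refactoring above pulls the blocked/mp_state handling out of __vcpu_run() into a vcpu_block() helper and turns the while (r > 0) loop into a for (;;) with explicit breaks, keeping the convention that 1 means "keep going" and anything <= 0 is returned to userspace. A compressed sketch of the resulting loop shape, with stubbed helpers rather than the KVM API:

struct loop_vcpu {
        int runnable;
};

static int enter_guest(struct loop_vcpu *vcpu)
{
        (void)vcpu;
        return 0;       /* pretend this exit must be handled in userspace */
}

static int block_until_runnable(struct loop_vcpu *vcpu)
{
        vcpu->runnable = 1;
        return 1;       /* 1: keep looping */
}

static int vcpu_run_sketch(struct loop_vcpu *vcpu)
{
        int r;

        for (;;) {
                r = vcpu->runnable ? enter_guest(vcpu) : block_until_runnable(vcpu);
                if (r <= 0)
                        break;          /* exit reason propagated to the caller */
                /* signal-pending and need_resched checks elided */
        }
        return r;
}
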
@@ -6568,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6568 } else 6609 } else
6569 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 6610 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
6570 6611
6571 r = __vcpu_run(vcpu); 6612 r = vcpu_run(vcpu);
6572 6613
6573out: 6614out:
6574 post_kvm_run_save(vcpu); 6615 post_kvm_run_save(vcpu);
@@ -7075,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
7075 kvm_clear_exception_queue(vcpu); 7116 kvm_clear_exception_queue(vcpu);
7076 7117
7077 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 7118 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
7119 kvm_update_dr0123(vcpu);
7078 vcpu->arch.dr6 = DR6_INIT; 7120 vcpu->arch.dr6 = DR6_INIT;
7079 kvm_update_dr6(vcpu); 7121 kvm_update_dr6(vcpu);
7080 vcpu->arch.dr7 = DR7_FIXED_1; 7122 vcpu->arch.dr7 = DR7_FIXED_1;
7081 kvm_update_dr7(vcpu); 7123 kvm_update_dr7(vcpu);
7082 7124
7125 vcpu->arch.cr2 = 0;
7126
7083 kvm_make_request(KVM_REQ_EVENT, vcpu); 7127 kvm_make_request(KVM_REQ_EVENT, vcpu);
7084 vcpu->arch.apf.msr_val = 0; 7128 vcpu->arch.apf.msr_val = 0;
7085 vcpu->arch.st.msr_val = 0; 7129 vcpu->arch.st.msr_val = 0;
@@ -7240,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7240 7284
7241 vcpu->arch.pv.pv_unhalted = false; 7285 vcpu->arch.pv.pv_unhalted = false;
7242 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 7286 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
7243 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 7287 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
7244 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 7288 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7245 else 7289 else
7246 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 7290 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7288,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7288 vcpu->arch.guest_supported_xcr0 = 0; 7332 vcpu->arch.guest_supported_xcr0 = 0;
7289 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 7333 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
7290 7334
7335 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
7336
7291 kvm_async_pf_hash_reset(vcpu); 7337 kvm_async_pf_hash_reset(vcpu);
7292 kvm_pmu_init(vcpu); 7338 kvm_pmu_init(vcpu);
7293 7339
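
kvm_arch_vcpu_init() now caches the guest's physical address width in vcpu->arch.maxphyaddr instead of re-deriving it on every use. As a rough userspace illustration of where that number comes from: the address width is reported in CPUID leaf 0x80000008, EAX bits 7:0. The 36-bit fallback below mirrors the common default when the leaf is absent; treat this as a sketch, not KVM's cpuid_query_maxphyaddr():

#include <cpuid.h>

static unsigned int query_maxphyaddr(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                return eax & 0xff;      /* physical address bits */
        return 36;                      /* conservative default if the leaf is absent */
}
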
@@ -7428,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7428 7474
7429 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 7475 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7430 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { 7476 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
7431 kvm_kvfree(free->arch.rmap[i]); 7477 kvfree(free->arch.rmap[i]);
7432 free->arch.rmap[i] = NULL; 7478 free->arch.rmap[i] = NULL;
7433 } 7479 }
7434 if (i == 0) 7480 if (i == 0)
@@ -7436,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7436 7482
7437 if (!dont || free->arch.lpage_info[i - 1] != 7483 if (!dont || free->arch.lpage_info[i - 1] !=
7438 dont->arch.lpage_info[i - 1]) { 7484 dont->arch.lpage_info[i - 1]) {
7439 kvm_kvfree(free->arch.lpage_info[i - 1]); 7485 kvfree(free->arch.lpage_info[i - 1]);
7440 free->arch.lpage_info[i - 1] = NULL; 7486 free->arch.lpage_info[i - 1] = NULL;
7441 } 7487 }
7442 } 7488 }
@@ -7490,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7490 7536
7491out_free: 7537out_free:
7492 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 7538 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7493 kvm_kvfree(slot->arch.rmap[i]); 7539 kvfree(slot->arch.rmap[i]);
7494 slot->arch.rmap[i] = NULL; 7540 slot->arch.rmap[i] = NULL;
7495 if (i == 0) 7541 if (i == 0)
7496 continue; 7542 continue;
7497 7543
7498 kvm_kvfree(slot->arch.lpage_info[i - 1]); 7544 kvfree(slot->arch.lpage_info[i - 1]);
7499 slot->arch.lpage_info[i - 1] = NULL; 7545 slot->arch.lpage_info[i - 1] = NULL;
7500 } 7546 }
7501 return -ENOMEM; 7547 return -ENOMEM;
@@ -7618,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7618 new = id_to_memslot(kvm->memslots, mem->slot); 7664 new = id_to_memslot(kvm->memslots, mem->slot);
7619 7665
7620 /* 7666 /*
7667 * Dirty logging tracks sptes in 4k granularity, meaning that large
7668 * sptes have to be split. If live migration is successful, the guest
7669 * in the source machine will be destroyed and large sptes will be
7670 * created in the destination. However, if the guest continues to run
7671 * in the source machine (for example if live migration fails), small
7672 * sptes will remain around and cause bad performance.
7673 *
7674 * Scan sptes if dirty logging has been stopped, dropping those
7675 * which can be collapsed into a single large-page spte. Later
7676 * page faults will create the large-page sptes.
7677 */
7678 if ((change != KVM_MR_DELETE) &&
7679 (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
7680 !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
7681 kvm_mmu_zap_collapsible_sptes(kvm, new);
7682
7683 /*
7621 * Set up write protection and/or dirty logging for the new slot. 7684 * Set up write protection and/or dirty logging for the new slot.
7622 * 7685 *
7623 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have 7686 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
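
The new comment explains why small sptes left over from dirty logging should be collapsed back into large-page mappings once logging stops. The trigger is simply "the slot had KVM_MEM_LOG_DIRTY_PAGES, no longer does, and is not being deleted"; a tiny standalone helper spelling out the flag part of that predicate (hypothetical helper name, real UAPI flag value):

#include <stdbool.h>

#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)      /* value as in the KVM UAPI */

static bool dirty_logging_just_stopped(unsigned long old_flags, unsigned long new_flags)
{
        return (old_flags & KVM_MEM_LOG_DIRTY_PAGES) &&
              !(new_flags & KVM_MEM_LOG_DIRTY_PAGES);
}
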
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..717908b16037 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
868 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 868 /* Some systems map "vectors" to interrupts weirdly. Not us! */
869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); 869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
870 if (i != SYSCALL_VECTOR) 870 if (i != SYSCALL_VECTOR)
871 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 871 set_intr_gate(i, irq_entries_start +
872 8 * (i - FIRST_EXTERNAL_VECTOR));
872 } 873 }
873 874
874 /* 875 /*
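
The lguest hunk tracks a change in how the external-interrupt entry stubs are addressed: instead of indexing an array of code pointers (interrupt[]), the gate target is now computed from irq_entries_start with a fixed stride, as base + 8 * (vector - FIRST_EXTERNAL_VECTOR). A sketch of that address computation (the 8-byte stride is taken from the hunk itself):

static void *vector_stub_address(void *irq_entries_start, unsigned int vector,
                                 unsigned int first_external_vector)
{
        /* each per-vector stub occupies 8 bytes, so indexing is plain pointer math */
        return (char *)irq_entries_start + 8 * (vector - first_external_vector);
}
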
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
1076{ 1077{
1077 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, 1078 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
1078 THREAD_SIZE / PAGE_SIZE); 1079 THREAD_SIZE / PAGE_SIZE);
1080 tss->x86_tss.sp0 = thread->sp0;
1079} 1081}
1080 1082
1081/* Let's just say, I wouldn't do debugging under a Guest. */ 1083/* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
13#include <asm/alternative-asm.h> 13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg
17 pushl_cfi %\reg
18 CFI_REL_OFFSET \reg, 0
19.endm
20
21.macro RESTORE reg
22 popl_cfi %\reg
23 CFI_RESTORE \reg
24.endm
25
26.macro read64 reg 16.macro read64 reg
27 movl %ebx, %eax 17 movl %ebx, %eax
28 movl %ecx, %edx 18 movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
67.macro addsub_return func ins insc 57.macro addsub_return func ins insc
68ENTRY(atomic64_\func\()_return_cx8) 58ENTRY(atomic64_\func\()_return_cx8)
69 CFI_STARTPROC 59 CFI_STARTPROC
70 SAVE ebp 60 pushl_cfi_reg ebp
71 SAVE ebx 61 pushl_cfi_reg ebx
72 SAVE esi 62 pushl_cfi_reg esi
73 SAVE edi 63 pushl_cfi_reg edi
74 64
75 movl %eax, %esi 65 movl %eax, %esi
76 movl %edx, %edi 66 movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
8910: 7910:
90 movl %ebx, %eax 80 movl %ebx, %eax
91 movl %ecx, %edx 81 movl %ecx, %edx
92 RESTORE edi 82 popl_cfi_reg edi
93 RESTORE esi 83 popl_cfi_reg esi
94 RESTORE ebx 84 popl_cfi_reg ebx
95 RESTORE ebp 85 popl_cfi_reg ebp
96 ret 86 ret
97 CFI_ENDPROC 87 CFI_ENDPROC
98ENDPROC(atomic64_\func\()_return_cx8) 88ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
104.macro incdec_return func ins insc 94.macro incdec_return func ins insc
105ENTRY(atomic64_\func\()_return_cx8) 95ENTRY(atomic64_\func\()_return_cx8)
106 CFI_STARTPROC 96 CFI_STARTPROC
107 SAVE ebx 97 pushl_cfi_reg ebx
108 98
109 read64 %esi 99 read64 %esi
1101: 1001:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
11910: 10910:
120 movl %ebx, %eax 110 movl %ebx, %eax
121 movl %ecx, %edx 111 movl %ecx, %edx
122 RESTORE ebx 112 popl_cfi_reg ebx
123 ret 113 ret
124 CFI_ENDPROC 114 CFI_ENDPROC
125ENDPROC(atomic64_\func\()_return_cx8) 115ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
130 120
131ENTRY(atomic64_dec_if_positive_cx8) 121ENTRY(atomic64_dec_if_positive_cx8)
132 CFI_STARTPROC 122 CFI_STARTPROC
133 SAVE ebx 123 pushl_cfi_reg ebx
134 124
135 read64 %esi 125 read64 %esi
1361: 1261:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
1462: 1362:
147 movl %ebx, %eax 137 movl %ebx, %eax
148 movl %ecx, %edx 138 movl %ecx, %edx
149 RESTORE ebx 139 popl_cfi_reg ebx
150 ret 140 ret
151 CFI_ENDPROC 141 CFI_ENDPROC
152ENDPROC(atomic64_dec_if_positive_cx8) 142ENDPROC(atomic64_dec_if_positive_cx8)
153 143
154ENTRY(atomic64_add_unless_cx8) 144ENTRY(atomic64_add_unless_cx8)
155 CFI_STARTPROC 145 CFI_STARTPROC
156 SAVE ebp 146 pushl_cfi_reg ebp
157 SAVE ebx 147 pushl_cfi_reg ebx
158/* these just push these two parameters on the stack */ 148/* these just push these two parameters on the stack */
159 SAVE edi 149 pushl_cfi_reg edi
160 SAVE ecx 150 pushl_cfi_reg ecx
161 151
162 movl %eax, %ebp 152 movl %eax, %ebp
163 movl %edx, %edi 153 movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
1793: 1693:
180 addl $8, %esp 170 addl $8, %esp
181 CFI_ADJUST_CFA_OFFSET -8 171 CFI_ADJUST_CFA_OFFSET -8
182 RESTORE ebx 172 popl_cfi_reg ebx
183 RESTORE ebp 173 popl_cfi_reg ebp
184 ret 174 ret
1854: 1754:
186 cmpl %edx, 4(%esp) 176 cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
192 182
193ENTRY(atomic64_inc_not_zero_cx8) 183ENTRY(atomic64_inc_not_zero_cx8)
194 CFI_STARTPROC 184 CFI_STARTPROC
195 SAVE ebx 185 pushl_cfi_reg ebx
196 186
197 read64 %esi 187 read64 %esi
1981: 1881:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
209 199
210 movl $1, %eax 200 movl $1, %eax
2113: 2013:
212 RESTORE ebx 202 popl_cfi_reg ebx
213 ret 203 ret
214 CFI_ENDPROC 204 CFI_ENDPROC
215ENDPROC(atomic64_inc_not_zero_cx8) 205ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
51 */ 51 */
52ENTRY(csum_partial) 52ENTRY(csum_partial)
53 CFI_STARTPROC 53 CFI_STARTPROC
54 pushl_cfi %esi 54 pushl_cfi_reg esi
55 CFI_REL_OFFSET esi, 0 55 pushl_cfi_reg ebx
56 pushl_cfi %ebx
57 CFI_REL_OFFSET ebx, 0
58 movl 20(%esp),%eax # Function arg: unsigned int sum 56 movl 20(%esp),%eax # Function arg: unsigned int sum
59 movl 16(%esp),%ecx # Function arg: int len 57 movl 16(%esp),%ecx # Function arg: int len
60 movl 12(%esp),%esi # Function arg: unsigned char *buff 58 movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
1276: addl %ecx,%eax 1256: addl %ecx,%eax
128 adcl $0, %eax 126 adcl $0, %eax
1297: 1277:
130 testl $1, 12(%esp) 128 testb $1, 12(%esp)
131 jz 8f 129 jz 8f
132 roll $8, %eax 130 roll $8, %eax
1338: 1318:
134 popl_cfi %ebx 132 popl_cfi_reg ebx
135 CFI_RESTORE ebx 133 popl_cfi_reg esi
136 popl_cfi %esi
137 CFI_RESTORE esi
138 ret 134 ret
139 CFI_ENDPROC 135 CFI_ENDPROC
140ENDPROC(csum_partial) 136ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
145 141
146ENTRY(csum_partial) 142ENTRY(csum_partial)
147 CFI_STARTPROC 143 CFI_STARTPROC
148 pushl_cfi %esi 144 pushl_cfi_reg esi
149 CFI_REL_OFFSET esi, 0 145 pushl_cfi_reg ebx
150 pushl_cfi %ebx
151 CFI_REL_OFFSET ebx, 0
152 movl 20(%esp),%eax # Function arg: unsigned int sum 146 movl 20(%esp),%eax # Function arg: unsigned int sum
153 movl 16(%esp),%ecx # Function arg: int len 147 movl 16(%esp),%ecx # Function arg: int len
154 movl 12(%esp),%esi # Function arg: const unsigned char *buf 148 movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
251 addl %ebx,%eax 245 addl %ebx,%eax
252 adcl $0,%eax 246 adcl $0,%eax
25380: 24780:
254 testl $1, 12(%esp) 248 testb $1, 12(%esp)
255 jz 90f 249 jz 90f
256 roll $8, %eax 250 roll $8, %eax
25790: 25190:
258 popl_cfi %ebx 252 popl_cfi_reg ebx
259 CFI_RESTORE ebx 253 popl_cfi_reg esi
260 popl_cfi %esi
261 CFI_RESTORE esi
262 ret 254 ret
263 CFI_ENDPROC 255 CFI_ENDPROC
264ENDPROC(csum_partial) 256ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
298 CFI_STARTPROC 290 CFI_STARTPROC
299 subl $4,%esp 291 subl $4,%esp
300 CFI_ADJUST_CFA_OFFSET 4 292 CFI_ADJUST_CFA_OFFSET 4
301 pushl_cfi %edi 293 pushl_cfi_reg edi
302 CFI_REL_OFFSET edi, 0 294 pushl_cfi_reg esi
303 pushl_cfi %esi 295 pushl_cfi_reg ebx
304 CFI_REL_OFFSET esi, 0
305 pushl_cfi %ebx
306 CFI_REL_OFFSET ebx, 0
307 movl ARGBASE+16(%esp),%eax # sum 296 movl ARGBASE+16(%esp),%eax # sum
308 movl ARGBASE+12(%esp),%ecx # len 297 movl ARGBASE+12(%esp),%ecx # len
309 movl ARGBASE+4(%esp),%esi # src 298 movl ARGBASE+4(%esp),%esi # src
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) )
412 401
413.previous 402.previous
414 403
415 popl_cfi %ebx 404 popl_cfi_reg ebx
416 CFI_RESTORE ebx 405 popl_cfi_reg esi
417 popl_cfi %esi 406 popl_cfi_reg edi
418 CFI_RESTORE esi
419 popl_cfi %edi
420 CFI_RESTORE edi
421 popl_cfi %ecx # equivalent to addl $4,%esp 407 popl_cfi %ecx # equivalent to addl $4,%esp
422 ret 408 ret
423 CFI_ENDPROC 409 CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
441 427
442ENTRY(csum_partial_copy_generic) 428ENTRY(csum_partial_copy_generic)
443 CFI_STARTPROC 429 CFI_STARTPROC
444 pushl_cfi %ebx 430 pushl_cfi_reg ebx
445 CFI_REL_OFFSET ebx, 0 431 pushl_cfi_reg edi
446 pushl_cfi %edi 432 pushl_cfi_reg esi
447 CFI_REL_OFFSET edi, 0
448 pushl_cfi %esi
449 CFI_REL_OFFSET esi, 0
450 movl ARGBASE+4(%esp),%esi #src 433 movl ARGBASE+4(%esp),%esi #src
451 movl ARGBASE+8(%esp),%edi #dst 434 movl ARGBASE+8(%esp),%edi #dst
452 movl ARGBASE+12(%esp),%ecx #len 435 movl ARGBASE+12(%esp),%ecx #len
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) )
506 jmp 7b 489 jmp 7b
507.previous 490.previous
508 491
509 popl_cfi %esi 492 popl_cfi_reg esi
510 CFI_RESTORE esi 493 popl_cfi_reg edi
511 popl_cfi %edi 494 popl_cfi_reg ebx
512 CFI_RESTORE edi
513 popl_cfi %ebx
514 CFI_RESTORE ebx
515 ret 495 ret
516 CFI_ENDPROC 496 CFI_ENDPROC
517ENDPROC(csum_partial_copy_generic) 497ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/cpufeature.h>
3#include <asm/alternative-asm.h> 4#include <asm/alternative-asm.h>
4 5
5/* 6/*
6 * Zero a page. 7 * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
7 * rdi page 8 * recommended to use this when possible and we do use them by default.
8 */ 9 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
9ENTRY(clear_page_c) 10 * Otherwise, use original.
11 */
12
13/*
14 * Zero a page.
15 * %rdi - page
16 */
17ENTRY(clear_page)
10 CFI_STARTPROC 18 CFI_STARTPROC
19
20 ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
21 "jmp clear_page_c_e", X86_FEATURE_ERMS
22
11 movl $4096/8,%ecx 23 movl $4096/8,%ecx
12 xorl %eax,%eax 24 xorl %eax,%eax
13 rep stosq 25 rep stosq
14 ret 26 ret
15 CFI_ENDPROC 27 CFI_ENDPROC
16ENDPROC(clear_page_c) 28ENDPROC(clear_page)
17 29
18ENTRY(clear_page_c_e) 30ENTRY(clear_page_orig)
19 CFI_STARTPROC 31 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26 32
27ENTRY(clear_page)
28 CFI_STARTPROC
29 xorl %eax,%eax 33 xorl %eax,%eax
30 movl $4096/64,%ecx 34 movl $4096/64,%ecx
31 .p2align 4 35 .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
45 nop 49 nop
46 ret 50 ret
47 CFI_ENDPROC 51 CFI_ENDPROC
48.Lclear_page_end: 52ENDPROC(clear_page_orig)
49ENDPROC(clear_page)
50
51 /*
52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
58 53
59#include <asm/cpufeature.h> 54ENTRY(clear_page_c_e)
60 55 CFI_STARTPROC
61 .section .altinstr_replacement,"ax" 56 movl $4096,%ecx
621: .byte 0xeb /* jmp <disp8> */ 57 xorl %eax,%eax
63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 58 rep stosb
642: .byte 0xeb /* jmp <disp8> */ 59 ret
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ 60 CFI_ENDPROC
663: 61ENDPROC(clear_page_c_e)
67 .previous
68 .section .altinstructions,"a"
69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
70 .Lclear_page_end-clear_page, 2b-1b
71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
72 .Lclear_page_end-clear_page,3b-2b
73 .previous
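
This rewrite (and the copy_page, copy_user, memcpy and memset rewrites that follow) replaces hand-rolled .altinstr_replacement/.altinstructions sections with the ALTERNATIVE/ALTERNATIVE_2 macros: the generic routine starts with a jmp to the fallback, and the alternatives framework NOPs it out or retargets it at boot according to CPU features. As a userspace analogy only (the kernel patches the code in place rather than dispatching through a pointer), here is the same three-way choice keyed off the CPUID ERMS bit, leaf 7, EBX bit 9. X86_FEATURE_REP_GOOD has no CPUID bit of its own, it is synthesized by the kernel, so it is only approximated by a parameter here:

#include <cpuid.h>
#include <string.h>

#define PAGE_SIZE 4096

typedef void (*clear_page_fn)(void *page);

static void clear_page_orig(void *page) { memset(page, 0, PAGE_SIZE); } /* unrolled-loop stand-in */
static void clear_page_rep(void *page)  { memset(page, 0, PAGE_SIZE); } /* "rep stosq" stand-in */
static void clear_page_erms(void *page) { memset(page, 0, PAGE_SIZE); } /* "rep stosb" stand-in */

static clear_page_fn pick_clear_page(int assume_rep_good)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 9)))
                return clear_page_erms;                 /* ERMS present */
        return assume_rep_good ? clear_page_rep : clear_page_orig;
}
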
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
5#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
6 7
8/*
9 * Some CPUs run faster using the string copy instructions (sane microcode).
10 * It is also a lot simpler. Use this when possible. But, don't use streaming
11 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
12 * prefetch distance based on SMP/UP.
13 */
7 ALIGN 14 ALIGN
8copy_page_rep: 15ENTRY(copy_page)
9 CFI_STARTPROC 16 CFI_STARTPROC
17 ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
10 movl $4096/8, %ecx 18 movl $4096/8, %ecx
11 rep movsq 19 rep movsq
12 ret 20 ret
13 CFI_ENDPROC 21 CFI_ENDPROC
14ENDPROC(copy_page_rep) 22ENDPROC(copy_page)
15
16/*
17 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
18 * Could vary the prefetch distance based on SMP/UP.
19*/
20 23
21ENTRY(copy_page) 24ENTRY(copy_page_regs)
22 CFI_STARTPROC 25 CFI_STARTPROC
23 subq $2*8, %rsp 26 subq $2*8, %rsp
24 CFI_ADJUST_CFA_OFFSET 2*8 27 CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
90 addq $2*8, %rsp 93 addq $2*8, %rsp
91 CFI_ADJUST_CFA_OFFSET -2*8 94 CFI_ADJUST_CFA_OFFSET -2*8
92 ret 95 ret
93.Lcopy_page_end:
94 CFI_ENDPROC 96 CFI_ENDPROC
95ENDPROC(copy_page) 97ENDPROC(copy_page_regs)
96
97 /* Some CPUs run faster using the string copy instructions.
98 It is also a lot simpler. Use this when possible */
99
100#include <asm/cpufeature.h>
101
102 .section .altinstr_replacement,"ax"
1031: .byte 0xeb /* jmp <disp8> */
104 .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
1052:
106 .previous
107 .section .altinstructions,"a"
108 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
109 .Lcopy_page_end-copy_page, 2b-1b
110 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11
12#define FIX_ALIGNMENT 1
13
14#include <asm/current.h> 11#include <asm/current.h>
15#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 13#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
19#include <asm/asm.h> 16#include <asm/asm.h>
20#include <asm/smap.h> 17#include <asm/smap.h>
21 18
22/*
23 * By placing feature2 after feature1 in altinstructions section, we logically
24 * implement:
25 * If CPU has feature2, jmp to alt2 is used
26 * else if CPU has feature1, jmp to alt1 is used
27 * else jmp to orig is used.
28 */
29 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
300:
31 .byte 0xe9 /* 32bit jump */
32 .long \orig-1f /* by default jump to orig */
331:
34 .section .altinstr_replacement,"ax"
352: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
373: .byte 0xe9 /* near jump with 32bit immediate */
38 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
39 .previous
40
41 .section .altinstructions,"a"
42 altinstruction_entry 0b,2b,\feature1,5,5
43 altinstruction_entry 0b,3b,\feature2,5,5
44 .previous
45 .endm
46
47 .macro ALIGN_DESTINATION 19 .macro ALIGN_DESTINATION
48#ifdef FIX_ALIGNMENT
49 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
50 movl %edi,%ecx 21 movl %edi,%ecx
51 andl $7,%ecx 22 andl $7,%ecx
@@ -67,7 +38,6 @@
67 38
68 _ASM_EXTABLE(100b,103b) 39 _ASM_EXTABLE(100b,103b)
69 _ASM_EXTABLE(101b,103b) 40 _ASM_EXTABLE(101b,103b)
70#endif
71 .endm 41 .endm
72 42
73/* Standard copy_to_user with segment limit checking */ 43/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
79 jc bad_to_user 49 jc bad_to_user
80 cmpq TI_addr_limit(%rax),%rcx 50 cmpq TI_addr_limit(%rax),%rcx
81 ja bad_to_user 51 ja bad_to_user
82 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 52 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
83 copy_user_generic_unrolled,copy_user_generic_string, \ 53 "jmp copy_user_generic_string", \
84 copy_user_enhanced_fast_string 54 X86_FEATURE_REP_GOOD, \
55 "jmp copy_user_enhanced_fast_string", \
56 X86_FEATURE_ERMS
85 CFI_ENDPROC 57 CFI_ENDPROC
86ENDPROC(_copy_to_user) 58ENDPROC(_copy_to_user)
87 59
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
94 jc bad_from_user 66 jc bad_from_user
95 cmpq TI_addr_limit(%rax),%rcx 67 cmpq TI_addr_limit(%rax),%rcx
96 ja bad_from_user 68 ja bad_from_user
97 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 69 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
98 copy_user_generic_unrolled,copy_user_generic_string, \ 70 "jmp copy_user_generic_string", \
99 copy_user_enhanced_fast_string 71 X86_FEATURE_REP_GOOD, \
72 "jmp copy_user_enhanced_fast_string", \
73 X86_FEATURE_ERMS
100 CFI_ENDPROC 74 CFI_ENDPROC
101ENDPROC(_copy_from_user) 75ENDPROC(_copy_from_user)
102 76
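
The removed ALTERNATIVE_JUMP macro encoded a priority: if the CPU has ERMS use the enhanced-string copy, else if it has REP_GOOD use the string copy, else the unrolled original. ALTERNATIVE_2 preserves that ordering because later alternatives are applied after earlier ones. The selection logic, written out as plain C with variant names following the hunk:

enum copy_user_variant {
        COPY_USER_GENERIC_UNROLLED,
        COPY_USER_GENERIC_STRING,
        COPY_USER_ENHANCED_FAST_STRING,
};

static enum copy_user_variant pick_copy_user(int has_rep_good, int has_erms)
{
        if (has_erms)
                return COPY_USER_ENHANCED_FAST_STRING;
        if (has_rep_good)
                return COPY_USER_GENERIC_STRING;
        return COPY_USER_GENERIC_UNROLLED;
}
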
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
196 196
197 /* handle last odd byte */ 197 /* handle last odd byte */
198.Lhandle_1: 198.Lhandle_1:
199 testl $1, %r10d 199 testb $1, %r10b
200 jz .Lende 200 jz .Lende
201 xorl %ebx, %ebx 201 xorl %ebx, %ebx
202 source 202 source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
52 */ 52 */
53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) 53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
54{ 54{
55 /*
56 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
57 * even if the input buffer is long enough to hold them.
58 */
59 if (buf_len > MAX_INSN_SIZE)
60 buf_len = MAX_INSN_SIZE;
61
55 memset(insn, 0, sizeof(*insn)); 62 memset(insn, 0, sizeof(*insn));
56 insn->kaddr = kaddr; 63 insn->kaddr = kaddr;
57 insn->end_kaddr = kaddr + buf_len; 64 insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
164 /* VEX.W overrides opnd_size */ 171 /* VEX.W overrides opnd_size */
165 insn->opnd_bytes = 8; 172 insn->opnd_bytes = 8;
166 } else { 173 } else {
174 /*
175 * For VEX2, fake VEX3-like byte#2.
176 * Makes it easier to decode vex.W, vex.vvvv,
177 * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
178 */
179 insn->vex_prefix.bytes[2] = b2 & 0x7f;
167 insn->vex_prefix.nbytes = 2; 180 insn->vex_prefix.nbytes = 2;
168 insn->next_byte += 2; 181 insn->next_byte += 2;
169 } 182 }
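
Two defensive tweaks land in the instruction decoder: insn_init() now clamps the buffer length to the architectural 15-byte instruction limit, and two-byte VEX prefixes get a faked third byte so vex.W/vvvv/L/pp can always be read from the same place. The clamp is easy to state on its own (MAX_INSN_SIZE really is 15 on x86):

#define MAX_INSN_SIZE 15        /* x86 instructions are at most 15 bytes long */

static int clamp_insn_buf_len(int buf_len)
{
        return buf_len > MAX_INSN_SIZE ? MAX_INSN_SIZE : buf_len;
}
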
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4
5#include <asm/cpufeature.h> 4#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
8 7
9/* 8/*
9 * We build a jump to memcpy_orig by default which gets NOPped out on
10 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
11 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
12 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
13 */
14
15.weak memcpy
16
17/*
10 * memcpy - Copy a memory block. 18 * memcpy - Copy a memory block.
11 * 19 *
12 * Input: 20 * Input:
@@ -17,15 +25,11 @@
17 * Output: 25 * Output:
18 * rax original destination 26 * rax original destination
19 */ 27 */
28ENTRY(__memcpy)
29ENTRY(memcpy)
30 ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
31 "jmp memcpy_erms", X86_FEATURE_ERMS
20 32
21/*
22 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
23 *
24 * This gets patched over the unrolled variant (below) via the
25 * alternative instructions framework:
26 */
27 .section .altinstr_replacement, "ax", @progbits
28.Lmemcpy_c:
29 movq %rdi, %rax 33 movq %rdi, %rax
30 movq %rdx, %rcx 34 movq %rdx, %rcx
31 shrq $3, %rcx 35 shrq $3, %rcx
@@ -34,29 +38,21 @@
34 movl %edx, %ecx 38 movl %edx, %ecx
35 rep movsb 39 rep movsb
36 ret 40 ret
37.Lmemcpy_e: 41ENDPROC(memcpy)
38 .previous 42ENDPROC(__memcpy)
39 43
40/* 44/*
41 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than 45 * memcpy_erms() - enhanced fast string memcpy. This is faster and
42 * memcpy_c. Use memcpy_c_e when possible. 46 * simpler than memcpy. Use memcpy_erms when possible.
43 *
44 * This gets patched over the unrolled variant (below) via the
45 * alternative instructions framework:
46 */ 47 */
47 .section .altinstr_replacement, "ax", @progbits 48ENTRY(memcpy_erms)
48.Lmemcpy_c_e:
49 movq %rdi, %rax 49 movq %rdi, %rax
50 movq %rdx, %rcx 50 movq %rdx, %rcx
51 rep movsb 51 rep movsb
52 ret 52 ret
53.Lmemcpy_e_e: 53ENDPROC(memcpy_erms)
54 .previous
55
56.weak memcpy
57 54
58ENTRY(__memcpy) 55ENTRY(memcpy_orig)
59ENTRY(memcpy)
60 CFI_STARTPROC 56 CFI_STARTPROC
61 movq %rdi, %rax 57 movq %rdi, %rax
62 58
@@ -183,26 +179,4 @@ ENTRY(memcpy)
183.Lend: 179.Lend:
184 retq 180 retq
185 CFI_ENDPROC 181 CFI_ENDPROC
186ENDPROC(memcpy) 182ENDPROC(memcpy_orig)
187ENDPROC(__memcpy)
188
189 /*
190 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
191 * If the feature is supported, memcpy_c_e() is the first choice.
192 * If enhanced rep movsb copy is not available, use fast string copy
193 * memcpy_c() when possible. This is faster and code is simpler than
194 * original memcpy().
195 * Otherwise, original memcpy() is used.
196 * In .altinstructions section, ERMS feature is placed after REG_GOOD
197 * feature to implement the right patch order.
198 *
199 * Replace only beginning, memcpy is used to apply alternatives,
200 * so it is silly to overwrite itself with nops - reboot is the
201 * only outcome...
202 */
203 .section .altinstructions, "a"
204 altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
205 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
206 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
207 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
208 .previous
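
Note that the ".weak memcpy" directive survives the rewrite: memcpy is defined as a weak symbol so that another object file (for example an instrumented implementation) can provide a strong definition that wins at link time. In C terms the same idea looks like this, using a hypothetical function name and the GCC/clang weak attribute:

/* Weak default; any non-weak definition of the same symbol elsewhere
 * in the link replaces it. */
__attribute__((weak)) void backend_copy(void *dst, const void *src, unsigned long n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        while (n--)
                *d++ = *s++;    /* simple byte-by-byte default */
}
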
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
5 * This assembly file is re-written from memmove_64.c file. 5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> 6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */ 7 */
8#define _STRING_C
9#include <linux/linkage.h> 8#include <linux/linkage.h>
10#include <asm/dwarf2.h> 9#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 10#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
44 jg 2f 43 jg 2f
45 44
46.Lmemmove_begin_forward: 45.Lmemmove_begin_forward:
46 ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
47
47 /* 48 /*
48 * movsq instruction have many startup latency 49 * movsq instruction have many startup latency
49 * so we handle small size by general register. 50 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
20713: 20813:
208 retq 209 retq
209 CFI_ENDPROC 210 CFI_ENDPROC
210
211 .section .altinstr_replacement,"ax"
212.Lmemmove_begin_forward_efs:
213 /* Forward moving data. */
214 movq %rdx, %rcx
215 rep movsb
216 retq
217.Lmemmove_end_forward_efs:
218 .previous
219
220 .section .altinstructions,"a"
221 altinstruction_entry .Lmemmove_begin_forward, \
222 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
223 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
224 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
225 .previous
226ENDPROC(__memmove) 211ENDPROC(__memmove)
227ENDPROC(memmove) 212ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
7 7
8.weak memset
9
8/* 10/*
9 * ISO C memset - set a memory block to a byte value. This function uses fast 11 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is 12 * string to get better performance than the original function. The code is
11 * simpler and shorter than the orignal function as well. 13 * simpler and shorter than the orignal function as well.
12 * 14 *
13 * rdi destination 15 * rdi destination
14 * rsi value (char) 16 * rsi value (char)
15 * rdx count (bytes) 17 * rdx count (bytes)
16 * 18 *
17 * rax original destination 19 * rax original destination
18 */ 20 */
19 .section .altinstr_replacement, "ax", @progbits 21ENTRY(memset)
20.Lmemset_c: 22ENTRY(__memset)
23 /*
24 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
25 * to use it when possible. If not available, use fast string instructions.
26 *
27 * Otherwise, use original memset function.
28 */
29 ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
30 "jmp memset_erms", X86_FEATURE_ERMS
31
21 movq %rdi,%r9 32 movq %rdi,%r9
22 movq %rdx,%rcx 33 movq %rdx,%rcx
23 andl $7,%edx 34 andl $7,%edx
@@ -31,8 +42,8 @@
31 rep stosb 42 rep stosb
32 movq %r9,%rax 43 movq %r9,%rax
33 ret 44 ret
34.Lmemset_e: 45ENDPROC(memset)
35 .previous 46ENDPROC(__memset)
36 47
37/* 48/*
38 * ISO C memset - set a memory block to a byte value. This function uses 49 * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
45 * 56 *
46 * rax original destination 57 * rax original destination
47 */ 58 */
48 .section .altinstr_replacement, "ax", @progbits 59ENTRY(memset_erms)
49.Lmemset_c_e:
50 movq %rdi,%r9 60 movq %rdi,%r9
51 movb %sil,%al 61 movb %sil,%al
52 movq %rdx,%rcx 62 movq %rdx,%rcx
53 rep stosb 63 rep stosb
54 movq %r9,%rax 64 movq %r9,%rax
55 ret 65 ret
56.Lmemset_e_e: 66ENDPROC(memset_erms)
57 .previous
58
59.weak memset
60 67
61ENTRY(memset) 68ENTRY(memset_orig)
62ENTRY(__memset)
63 CFI_STARTPROC 69 CFI_STARTPROC
64 movq %rdi,%r10 70 movq %rdi,%r10
65 71
@@ -134,23 +140,4 @@ ENTRY(__memset)
134 jmp .Lafter_bad_alignment 140 jmp .Lafter_bad_alignment
135.Lfinal: 141.Lfinal:
136 CFI_ENDPROC 142 CFI_ENDPROC
137ENDPROC(memset) 143ENDPROC(memset_orig)
138ENDPROC(__memset)
139
140 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
141 * It is recommended to use this when possible.
142 *
143 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
144 * instructions.
145 *
146 * Otherwise, use original memset function.
147 *
148 * In .altinstructions section, ERMS feature is placed after REG_GOOD
149 * feature to implement the right patch order.
150 */
151 .section .altinstructions,"a"
152 altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
153 .Lfinal-__memset,.Lmemset_e-.Lmemset_c
154 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
155 .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
156 .previous
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi_reg rbx
18 pushq_cfi %rbp 18 pushq_cfi_reg rbp
19 movq %rdi, %r10 /* Save pointer */ 19 movq %rdi, %r10 /* Save pointer */
20 xorl %r11d, %r11d /* Return value */ 20 xorl %r11d, %r11d /* Return value */
21 movl (%rdi), %eax 21 movl (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
35 movl %ebp, 20(%r10) 35 movl %ebp, 20(%r10)
36 movl %esi, 24(%r10) 36 movl %esi, 24(%r10)
37 movl %edi, 28(%r10) 37 movl %edi, 28(%r10)
38 popq_cfi %rbp 38 popq_cfi_reg rbp
39 popq_cfi %rbx 39 popq_cfi_reg rbx
40 ret 40 ret
413: 413:
42 CFI_RESTORE_STATE 42 CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi_reg ebx
57 pushl_cfi %ebp 57 pushl_cfi_reg ebp
58 pushl_cfi %esi 58 pushl_cfi_reg esi
59 pushl_cfi %edi 59 pushl_cfi_reg edi
60 pushl_cfi $0 /* Return value */ 60 pushl_cfi $0 /* Return value */
61 pushl_cfi %eax 61 pushl_cfi %eax
62 movl 4(%eax), %ecx 62 movl 4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
80 movl %esi, 24(%eax) 80 movl %esi, 24(%eax)
81 movl %edi, 28(%eax) 81 movl %edi, 28(%eax)
82 popl_cfi %eax 82 popl_cfi %eax
83 popl_cfi %edi 83 popl_cfi_reg edi
84 popl_cfi %esi 84 popl_cfi_reg esi
85 popl_cfi %ebp 85 popl_cfi_reg ebp
86 popl_cfi %ebx 86 popl_cfi_reg ebx
87 ret 87 ret
883: 883:
89 CFI_RESTORE_STATE 89 CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
34 */ 34 */
35 35
36#define save_common_regs \ 36#define save_common_regs \
37 pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 37 pushl_cfi_reg ecx
38 38
39#define restore_common_regs \ 39#define restore_common_regs \
40 popl_cfi %ecx; CFI_RESTORE ecx 40 popl_cfi_reg ecx
41 41
42 /* Avoid uglifying the argument copying x86-64 needs to do. */ 42 /* Avoid uglifying the argument copying x86-64 needs to do. */
43 .macro movq src, dst 43 .macro movq src, dst
@@ -64,22 +64,22 @@
64 */ 64 */
65 65
66#define save_common_regs \ 66#define save_common_regs \
67 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ 67 pushq_cfi_reg rdi; \
68 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ 68 pushq_cfi_reg rsi; \
69 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ 69 pushq_cfi_reg rcx; \
70 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ 70 pushq_cfi_reg r8; \
71 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ 71 pushq_cfi_reg r9; \
72 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ 72 pushq_cfi_reg r10; \
73 pushq_cfi %r11; CFI_REL_OFFSET r11, 0 73 pushq_cfi_reg r11
74 74
75#define restore_common_regs \ 75#define restore_common_regs \
76 popq_cfi %r11; CFI_RESTORE r11; \ 76 popq_cfi_reg r11; \
77 popq_cfi %r10; CFI_RESTORE r10; \ 77 popq_cfi_reg r10; \
78 popq_cfi %r9; CFI_RESTORE r9; \ 78 popq_cfi_reg r9; \
79 popq_cfi %r8; CFI_RESTORE r8; \ 79 popq_cfi_reg r8; \
80 popq_cfi %rcx; CFI_RESTORE rcx; \ 80 popq_cfi_reg rcx; \
81 popq_cfi %rsi; CFI_RESTORE rsi; \ 81 popq_cfi_reg rsi; \
82 popq_cfi %rdi; CFI_RESTORE rdi 82 popq_cfi_reg rdi
83 83
84#endif 84#endif
85 85
@@ -87,12 +87,10 @@
87ENTRY(call_rwsem_down_read_failed) 87ENTRY(call_rwsem_down_read_failed)
88 CFI_STARTPROC 88 CFI_STARTPROC
89 save_common_regs 89 save_common_regs
90 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 90 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
91 CFI_REL_OFFSET __ASM_REG(dx), 0
92 movq %rax,%rdi 91 movq %rax,%rdi
93 call rwsem_down_read_failed 92 call rwsem_down_read_failed
94 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 93 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
95 CFI_RESTORE __ASM_REG(dx)
96 restore_common_regs 94 restore_common_regs
97 ret 95 ret
98 CFI_ENDPROC 96 CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
124ENTRY(call_rwsem_downgrade_wake) 122ENTRY(call_rwsem_downgrade_wake)
125 CFI_STARTPROC 123 CFI_STARTPROC
126 save_common_regs 124 save_common_regs
127 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 125 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
128 CFI_REL_OFFSET __ASM_REG(dx), 0
129 movq %rax,%rdi 126 movq %rax,%rdi
130 call rwsem_downgrade_wake 127 call rwsem_downgrade_wake
131 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 128 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
132 CFI_RESTORE __ASM_REG(dx)
133 restore_common_regs 129 restore_common_regs
134 ret 130 ret
135 CFI_ENDPROC 131 CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
13 .globl \name 13 .globl \name
14\name: 14\name:
15 CFI_STARTPROC 15 CFI_STARTPROC
16 pushl_cfi %eax 16 pushl_cfi_reg eax
17 CFI_REL_OFFSET eax, 0 17 pushl_cfi_reg ecx
18 pushl_cfi %ecx 18 pushl_cfi_reg edx
19 CFI_REL_OFFSET ecx, 0
20 pushl_cfi %edx
21 CFI_REL_OFFSET edx, 0
22 19
23 .if \put_ret_addr_in_eax 20 .if \put_ret_addr_in_eax
24 /* Place EIP in the arg1 */ 21 /* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
26 .endif 23 .endif
27 24
28 call \func 25 call \func
29 popl_cfi %edx 26 popl_cfi_reg edx
30 CFI_RESTORE edx 27 popl_cfi_reg ecx
31 popl_cfi %ecx 28 popl_cfi_reg eax
32 CFI_RESTORE ecx
33 popl_cfi %eax
34 CFI_RESTORE eax
35 ret 29 ret
36 CFI_ENDPROC 30 CFI_ENDPROC
37 _ASM_NOKPROBE(\name) 31 _ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
17 CFI_STARTPROC 17 CFI_STARTPROC
18 18
19 /* this one pushes 9 elems, the next one would be %rIP */ 19 /* this one pushes 9 elems, the next one would be %rIP */
20 SAVE_ARGS 20 pushq_cfi_reg rdi
21 pushq_cfi_reg rsi
22 pushq_cfi_reg rdx
23 pushq_cfi_reg rcx
24 pushq_cfi_reg rax
25 pushq_cfi_reg r8
26 pushq_cfi_reg r9
27 pushq_cfi_reg r10
28 pushq_cfi_reg r11
21 29
22 .if \put_ret_addr_in_rdi 30 .if \put_ret_addr_in_rdi
31 /* 9*8(%rsp) is return addr on stack */
23 movq_cfi_restore 9*8, rdi 32 movq_cfi_restore 9*8, rdi
24 .endif 33 .endif
25 34
@@ -45,11 +54,22 @@
45#endif 54#endif
46#endif 55#endif
47 56
48 /* SAVE_ARGS below is used only for the .cfi directives it contains. */ 57#if defined(CONFIG_TRACE_IRQFLAGS) \
58 || defined(CONFIG_DEBUG_LOCK_ALLOC) \
59 || defined(CONFIG_PREEMPT)
49 CFI_STARTPROC 60 CFI_STARTPROC
50 SAVE_ARGS 61 CFI_ADJUST_CFA_OFFSET 9*8
51restore: 62restore:
52 RESTORE_ARGS 63 popq_cfi_reg r11
64 popq_cfi_reg r10
65 popq_cfi_reg r9
66 popq_cfi_reg r8
67 popq_cfi_reg rax
68 popq_cfi_reg rcx
69 popq_cfi_reg rdx
70 popq_cfi_reg rsi
71 popq_cfi_reg rdi
53 ret 72 ret
54 CFI_ENDPROC 73 CFI_ENDPROC
55 _ASM_NOKPROBE(restore) 74 _ASM_NOKPROBE(restore)
75#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
273de: ESC 273de: ESC
274df: ESC 274df: ESC
275# 0xe0 - 0xef 275# 0xe0 - 0xef
276# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
277# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
278# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
276e0: LOOPNE/LOOPNZ Jb (f64) 279e0: LOOPNE/LOOPNZ Jb (f64)
277e1: LOOPE/LOOPZ Jb (f64) 280e1: LOOPE/LOOPZ Jb (f64)
278e2: LOOP Jb (f64) 281e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
281e5: IN eAX,Ib 284e5: IN eAX,Ib
282e6: OUT Ib,AL 285e6: OUT Ib,AL
283e7: OUT Ib,eAX 286e7: OUT Ib,eAX
287# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
288# in "near" jumps and calls is 16-bit. For CALL,
289# push of return address is 16-bit wide, RSP is decremented by 2
290# but is not truncated to 16 bits, unlike RIP.
284e8: CALL Jz (f64) 291e8: CALL Jz (f64)
285e9: JMP-near Jz (f64) 292e9: JMP-near Jz (f64)
286ea: JMP-far Ap (i64) 293ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
4567e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 4637e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
4577f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) 4647f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
458# 0x0f 0x80-0x8f 465# 0x0f 0x80-0x8f
466# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
45980: JO Jz (f64) 46780: JO Jz (f64)
46081: JNO Jz (f64) 46881: JNO Jz (f64)
46182: JB/JC/JNAE Jz (f64) 46982: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
842GrpTable: Grp5 850GrpTable: Grp5
8430: INC Ev 8510: INC Ev
8441: DEC Ev 8521: DEC Ev
853# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
8452: CALLN Ev (f64) 8542: CALLN Ev (f64)
8463: CALLF Ep 8553: CALLF Ep
8474: JMPN Ev (f64) 8564: JMPN Ev (f64)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
32obj-$(CONFIG_ACPI_NUMA) += srat.o 32obj-$(CONFIG_ACPI_NUMA) += srat.o
33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
34 34
35obj-$(CONFIG_MEMTEST) += memtest.o
36
37obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 35obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..181c53bac3a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
59 int ret = 0; 59 int ret = 0;
60 60
61 /* kprobe_running() needs smp_processor_id() */ 61 /* kprobe_running() needs smp_processor_id() */
62 if (kprobes_built_in() && !user_mode_vm(regs)) { 62 if (kprobes_built_in() && !user_mode(regs)) {
63 preempt_disable(); 63 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 64 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 65 ret = 1;
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
148 instr = (void *)convert_ip_to_linear(current, regs); 148 instr = (void *)convert_ip_to_linear(current, regs);
149 max_instr = instr + 15; 149 max_instr = instr + 15;
150 150
151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
152 return 0; 152 return 0;
153 153
154 while (instr < max_instr) { 154 while (instr < max_instr) {
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1035 if (error_code & PF_USER) 1035 if (error_code & PF_USER)
1036 return false; 1036 return false;
1037 1037
1038 if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) 1038 if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1039 return false; 1039 return false;
1040 1040
1041 return true; 1041 return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1140 * User-mode registers count as a user access even for any 1140 * User-mode registers count as a user access even for any
1141 * potential system fault or CPU buglet: 1141 * potential system fault or CPU buglet:
1142 */ 1142 */
1143 if (user_mode_vm(regs)) { 1143 if (user_mode(regs)) {
1144 local_irq_enable(); 1144 local_irq_enable();
1145 error_code |= PF_USER; 1145 error_code |= PF_USER;
1146 flags |= FAULT_FLAG_USER; 1146 flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..1d553186c434 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,29 +29,33 @@
29 29
30/* 30/*
31 * Tables translating between page_cache_type_t and pte encoding. 31 * Tables translating between page_cache_type_t and pte encoding.
32 * Minimal supported modes are defined statically, modified if more supported 32 *
33 * cache modes are available. 33 * Minimal supported modes are defined statically, they are modified
34 * Index into __cachemode2pte_tbl is the cachemode. 34 * during bootup if more supported cache modes are available.
35 * Index into __pte2cachemode_tbl are the caching attribute bits of the pte 35 *
36 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 36 * Index into __cachemode2pte_tbl[] is the cachemode.
37 *
38 * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
39 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
37 */ 40 */
38uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { 41uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
39 [_PAGE_CACHE_MODE_WB] = 0, 42 [_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
40 [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, 43 [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 ,
41 [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, 44 [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
42 [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, 45 [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
43 [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, 46 [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
44 [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, 47 [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
45}; 48};
46EXPORT_SYMBOL(__cachemode2pte_tbl); 49EXPORT_SYMBOL(__cachemode2pte_tbl);
50
47uint8_t __pte2cachemode_tbl[8] = { 51uint8_t __pte2cachemode_tbl[8] = {
48 [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, 52 [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
49 [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, 53 [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC,
50 [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, 54 [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
51 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, 55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
52 [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, 56 [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
53 [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, 57 [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
54 [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, 58 [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, 59 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
56}; 60};
57EXPORT_SYMBOL(__pte2cachemode_tbl); 61EXPORT_SYMBOL(__pte2cachemode_tbl);
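
The table initializers above are only reformatted (each PWT/PCD/PAT column written out explicitly), but the accompanying comment is worth unpacking: the index into __pte2cachemode_tbl[] is built from the pte's three caching attribute bits, with _PAGE_PWT, _PAGE_PCD and _PAGE_PAT landing in index bits 0, 1 and 2. A sketch of that index calculation, taking the bit masks as parameters rather than the kernel's constants:

static unsigned int pte2cm_idx_sketch(unsigned long pte_flags,
                                      unsigned long page_pwt,
                                      unsigned long page_pcd,
                                      unsigned long page_pat)
{
        return (!!(pte_flags & page_pwt) << 0) |
               (!!(pte_flags & page_pcd) << 1) |
               (!!(pte_flags & page_pat) << 2);
}
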
@@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void)
131 135
132int after_bootmem; 136int after_bootmem;
133 137
134int direct_gbpages 138early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
135#ifdef CONFIG_DIRECT_GBPAGES
136 = 1
137#endif
138;
139
140static void __init init_gbpages(void)
141{
142#ifdef CONFIG_X86_64
143 if (direct_gbpages && cpu_has_gbpages)
144 printk(KERN_INFO "Using GB pages for direct mapping\n");
145 else
146 direct_gbpages = 0;
147#endif
148}
149 139
150struct map_range { 140struct map_range {
151 unsigned long start; 141 unsigned long start;
@@ -157,16 +147,12 @@ static int page_size_mask;
157 147
158static void __init probe_page_size_mask(void) 148static void __init probe_page_size_mask(void)
159{ 149{
160 init_gbpages();
161
162#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) 150#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
163 /* 151 /*
164 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 152 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
165 * This will simplify cpa(), which otherwise needs to support splitting 153 * This will simplify cpa(), which otherwise needs to support splitting
166 * large pages into small in interrupt context, etc. 154 * large pages into small in interrupt context, etc.
167 */ 155 */
168 if (direct_gbpages)
169 page_size_mask |= 1 << PG_LEVEL_1G;
170 if (cpu_has_pse) 156 if (cpu_has_pse)
171 page_size_mask |= 1 << PG_LEVEL_2M; 157 page_size_mask |= 1 << PG_LEVEL_2M;
172#endif 158#endif
@@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void)
179 if (cpu_has_pge) { 165 if (cpu_has_pge) {
180 cr4_set_bits_and_update_boot(X86_CR4_PGE); 166 cr4_set_bits_and_update_boot(X86_CR4_PGE);
181 __supported_pte_mask |= _PAGE_GLOBAL; 167 __supported_pte_mask |= _PAGE_GLOBAL;
168 } else
169 __supported_pte_mask &= ~_PAGE_GLOBAL;
170
171 /* Enable 1 GB linear kernel mappings if available: */
172 if (direct_gbpages && cpu_has_gbpages) {
173 printk(KERN_INFO "Using GB pages for direct mapping\n");
174 page_size_mask |= 1 << PG_LEVEL_1G;
175 } else {
176 direct_gbpages = 0;
182 } 177 }
183} 178}
184 179
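
The gbpages/nogbpages early parameters collapse into a single early_param_on_off() declaration with a Kconfig-provided default, and the decision about 1GB direct mappings moves into probe_page_size_mask(): the PG_LEVEL_1G bit is set only when both the flag and the CPU's gbpages capability agree. The combined check, written out with the flags and levels passed in as plain ints for the sketch:

static unsigned int probe_page_size_mask_sketch(int direct_gbpages, int cpu_has_gbpages,
                                                int cpu_has_pse, int pg_level_2m,
                                                int pg_level_1g)
{
        unsigned int mask = 0;

        if (cpu_has_pse)
                mask |= 1u << pg_level_2m;              /* 2MB pages */
        if (direct_gbpages && cpu_has_gbpages)
                mask |= 1u << pg_level_1g;              /* 1GB pages */
        return mask;
}
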
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30eb05ae7061..3fba623e3ba5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
130 return 0; 130 return 0;
131} 131}
132 132
133static int __init parse_direct_gbpages_off(char *arg)
134{
135 direct_gbpages = 0;
136 return 0;
137}
138early_param("nogbpages", parse_direct_gbpages_off);
139
140static int __init parse_direct_gbpages_on(char *arg)
141{
142 direct_gbpages = 1;
143 return 0;
144}
145early_param("gbpages", parse_direct_gbpages_on);
146
147/* 133/*
148 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the 134 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
149 * physical space so we can cache the place of the first one and move 135 * physical space so we can cache the place of the first one and move
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
67 67
68/* 68/*
69 * Remap an arbitrary physical address space into the kernel virtual 69 * Remap an arbitrary physical address space into the kernel virtual
70 * address space. Needed when the kernel wants to access high addresses 70 * address space. It transparently creates kernel huge I/O mapping when
71 * directly. 71 * the physical address is aligned by a huge page size (1GB or 2MB) and
72 * the requested size is at least the huge page size.
73 *
74 * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
75 * Therefore, the mapping code falls back to use a smaller page toward 4KB
76 * when a mapping range is covered by non-WB type of MTRRs.
72 * 77 *
73 * NOTE! We need to allow non-page-aligned mappings too: we will obviously 78 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
74 * have to convert them into an offset in a page-aligned mapping, but the 79 * have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
326} 331}
327EXPORT_SYMBOL(iounmap); 332EXPORT_SYMBOL(iounmap);
328 333
334int arch_ioremap_pud_supported(void)
335{
336#ifdef CONFIG_X86_64
337 return cpu_has_gbpages;
338#else
339 return 0;
340#endif
341}
342
343int arch_ioremap_pmd_supported(void)
344{
345 return cpu_has_pse;
346}
347
329/* 348/*
330 * Convert a physical pointer to a virtual kernel pointer for /dev/mem 349 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
331 * access 350 * access
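
The new ioremap() comment and the arch_ioremap_pud/pmd_supported() helpers describe when a huge I/O mapping may be used: the CPU must support the page size (gbpages for 1GB PUDs, PSE for 2MB PMDs) and, per the comment, the physical address must be aligned to the huge page size with the request covering at least that much. A sketch of the size/alignment part of that test, with the sizes hard-coded for illustration:

#include <stdbool.h>

#define PMD_MAPPING_SIZE (2ULL << 20)   /* 2MB; pass this for a PMD-level check */
#define PUD_MAPPING_SIZE (1ULL << 30)   /* 1GB; pass this for a PUD-level check */

static bool huge_iomap_possible(unsigned long long phys_addr,
                                unsigned long long size,
                                unsigned long long huge_size)
{
        return (phys_addr % huge_size) == 0 && size >= huge_size;
}
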
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h>
10
11static u64 patterns[] __initdata = {
12 /* The first entry has to be 0 to leave memtest with zeroed memory */
13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
33{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
35 (unsigned long long) pattern,
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad);
39}
40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{
43 u64 *p, *start, *end;
44 u64 start_bad, last_bad;
45 u64 start_phys_aligned;
46 const size_t incr = sizeof(pattern);
47
48 start_phys_aligned = ALIGN(start_phys, incr);
49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
51 start_bad = 0;
52 last_bad = 0;
53
54 for (p = start; p < end; p++)
55 *p = pattern;
56
57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
59 continue;
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 continue;
63 }
64 if (start_bad)
65 reserve_bad_mem(pattern, start_bad, last_bad + incr);
66 start_bad = last_bad = start_phys_aligned;
67 }
68 if (start_bad)
69 reserve_bad_mem(pattern, start_bad, last_bad + incr);
70}
71
72static void __init do_one_pass(u64 pattern, u64 start, u64 end)
73{
74 u64 i;
75 phys_addr_t this_start, this_end;
76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end);
80 if (this_start < this_end) {
81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
82 (unsigned long long)this_start,
83 (unsigned long long)this_end,
84 (unsigned long long)cpu_to_be64(pattern));
85 memtest(pattern, this_start, this_end - this_start);
86 }
87 }
88}
89
90/* default is disabled */
91static int memtest_pattern __initdata;
92
93static int __init parse_memtest(char *arg)
94{
95 if (arg)
96 memtest_pattern = simple_strtoul(arg, NULL, 0);
97 else
98 memtest_pattern = ARRAY_SIZE(patterns);
99
100 return 0;
101}
102
103early_param("memtest", parse_memtest);
104
105void __init early_memtest(unsigned long start, unsigned long end)
106{
107 unsigned int i;
108 unsigned int idx = 0;
109
110 if (!memtest_pattern)
111 return;
112
113 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
114 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
115 idx = i % ARRAY_SIZE(patterns);
116 do_one_pass(patterns[idx], start, end);
117 }
118}
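The deleted memtest() above writes each pattern over free memory, reads it back, and reserves any range that fails to verify. A small userspace sketch of the same write-then-verify pass over an ordinary buffer; memtest_buf() is a stand-in, and a real bad range would be handed to memblock_reserve():

#include <stdio.h>
#include <stdint.h>

/* Write a pattern over a buffer, then count the words that fail to verify. */
static size_t memtest_buf(uint64_t pattern, uint64_t *buf, size_t words)
{
	size_t i, bad = 0;

	for (i = 0; i < words; i++)
		buf[i] = pattern;

	for (i = 0; i < words; i++)
		if (buf[i] != pattern)	/* the kernel would reserve this range */
			bad++;

	return bad;
}

int main(void)
{
	static uint64_t buf[1024];

	printf("bad words: %zu\n",
	       memtest_buf(0x5555555555555555ULL, buf, 1024));
	return 0;
}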
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
65 return sysctl_legacy_va_layout; 65 return sysctl_legacy_va_layout;
66} 66}
67 67
68static unsigned long mmap_rnd(void) 68unsigned long arch_mmap_rnd(void)
69{ 69{
70 unsigned long rnd = 0; 70 unsigned long rnd;
71 71
72 /* 72 /*
73 * 8 bits of randomness in 32bit mmaps, 20 address space bits 73 * 8 bits of randomness in 32bit mmaps, 20 address space bits
74 * 28 bits of randomness in 64bit mmaps, 40 address space bits 74 * 28 bits of randomness in 64bit mmaps, 40 address space bits
75 */ 75 */
76 if (current->flags & PF_RANDOMIZE) { 76 if (mmap_is_ia32())
77 if (mmap_is_ia32()) 77 rnd = (unsigned long)get_random_int() % (1<<8);
78 rnd = get_random_int() % (1<<8); 78 else
79 else 79 rnd = (unsigned long)get_random_int() % (1<<28);
80 rnd = get_random_int() % (1<<28); 80
81 }
82 return rnd << PAGE_SHIFT; 81 return rnd << PAGE_SHIFT;
83} 82}
84 83
85static unsigned long mmap_base(void) 84static unsigned long mmap_base(unsigned long rnd)
86{ 85{
87 unsigned long gap = rlimit(RLIMIT_STACK); 86 unsigned long gap = rlimit(RLIMIT_STACK);
88 87
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
91 else if (gap > MAX_GAP) 90 else if (gap > MAX_GAP)
92 gap = MAX_GAP; 91 gap = MAX_GAP;
93 92
94 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); 93 return PAGE_ALIGN(TASK_SIZE - gap - rnd);
95} 94}
96 95
97/* 96/*
98 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
99 * does, but not when emulating X86_32 98 * does, but not when emulating X86_32
100 */ 99 */
101static unsigned long mmap_legacy_base(void) 100static unsigned long mmap_legacy_base(unsigned long rnd)
102{ 101{
103 if (mmap_is_ia32()) 102 if (mmap_is_ia32())
104 return TASK_UNMAPPED_BASE; 103 return TASK_UNMAPPED_BASE;
105 else 104 else
106 return TASK_UNMAPPED_BASE + mmap_rnd(); 105 return TASK_UNMAPPED_BASE + rnd;
107} 106}
108 107
109/* 108/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
112 */ 111 */
113void arch_pick_mmap_layout(struct mm_struct *mm) 112void arch_pick_mmap_layout(struct mm_struct *mm)
114{ 113{
115 mm->mmap_legacy_base = mmap_legacy_base(); 114 unsigned long random_factor = 0UL;
116 mm->mmap_base = mmap_base(); 115
116 if (current->flags & PF_RANDOMIZE)
117 random_factor = arch_mmap_rnd();
118
119 mm->mmap_legacy_base = mmap_legacy_base(random_factor);
117 120
118 if (mmap_is_legacy()) { 121 if (mmap_is_legacy()) {
119 mm->mmap_base = mm->mmap_legacy_base; 122 mm->mmap_base = mm->mmap_legacy_base;
120 mm->get_unmapped_area = arch_get_unmapped_area; 123 mm->get_unmapped_area = arch_get_unmapped_area;
121 } else { 124 } else {
125 mm->mmap_base = mmap_base(random_factor);
122 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 126 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
123 } 127 }
124} 128}
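The mmap.c change above splits the random offset out of mmap_base()/mmap_legacy_base(): arch_mmap_rnd() computes it once, and arch_pick_mmap_layout() applies it only when PF_RANDOMIZE is set. A minimal userspace sketch of the arithmetic, with rand() standing in for get_random_int() and PAGE_SHIFT assumed to be 12:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SHIFT 12	/* assumed 4KB pages */

/* 8 random bits for 32-bit tasks, 28 for 64-bit, shifted to a page offset. */
static unsigned long mmap_rnd(int is_ia32)
{
	unsigned long rnd;

	if (is_ia32)
		rnd = (unsigned long)rand() % (1UL << 8);
	else
		rnd = (unsigned long)rand() % (1UL << 28);	/* 64-bit builds */

	return rnd << PAGE_SHIFT;
}

int main(void)
{
	srand((unsigned)time(NULL));
	printf("32-bit offset: %#lx\n", mmap_rnd(1));
	printf("64-bit offset: %#lx\n", mmap_rnd(0));
	return 0;
}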
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index cd4785bbacb9..4053bb58bf92 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void)
482 &memblock.reserved, mb->nid); 482 &memblock.reserved, mb->nid);
483 } 483 }
484 484
485 /* Mark all kernel nodes. */ 485 /*
486 * Mark all kernel nodes.
487 *
488 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
489 * may not include all the memblock.reserved memory ranges because
490 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
491 */
486 for_each_memblock(reserved, r) 492 for_each_memblock(reserved, r)
487 node_set(r->nid, numa_kernel_nodes); 493 if (r->nid != MAX_NUMNODES)
494 node_set(r->nid, numa_kernel_nodes);
488 495
489 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ 496 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
490 for (i = 0; i < numa_meminfo.nr_blks; i++) { 497 for (i = 0; i < numa_meminfo.nr_blks; i++) {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 536ea2fb6e33..89af288ec674 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
81 seq_printf(m, "DirectMap4M: %8lu kB\n", 81 seq_printf(m, "DirectMap4M: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_2M] << 12); 82 direct_pages_count[PG_LEVEL_2M] << 12);
83#endif 83#endif
84#ifdef CONFIG_X86_64
85 if (direct_gbpages) 84 if (direct_gbpages)
86 seq_printf(m, "DirectMap1G: %8lu kB\n", 85 seq_printf(m, "DirectMap1G: %8lu kB\n",
87 direct_pages_count[PG_LEVEL_1G] << 20); 86 direct_pages_count[PG_LEVEL_1G] << 20);
88#endif
89} 87}
90#else 88#else
91static inline void split_page_count(int level) { } 89static inline void split_page_count(int level) { }
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages)
1654{ 1652{
1655 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1653 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1656} 1654}
1657EXPORT_SYMBOL_GPL(set_memory_ro);
1658 1655
1659int set_memory_rw(unsigned long addr, int numpages) 1656int set_memory_rw(unsigned long addr, int numpages)
1660{ 1657{
1661 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1658 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1662} 1659}
1663EXPORT_SYMBOL_GPL(set_memory_rw);
1664 1660
1665int set_memory_np(unsigned long addr, int numpages) 1661int set_memory_np(unsigned long addr, int numpages)
1666{ 1662{
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7ac68698406c..35af6771a95a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
610} 610}
611 611
612#ifdef CONFIG_STRICT_DEVMEM 612#ifdef CONFIG_STRICT_DEVMEM
613/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ 613/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
614static inline int range_is_allowed(unsigned long pfn, unsigned long size) 614static inline int range_is_allowed(unsigned long pfn, unsigned long size)
615{ 615{
616 return 1; 616 return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
628 628
629 while (cursor < to) { 629 while (cursor < to) {
630 if (!devmem_is_allowed(pfn)) { 630 if (!devmem_is_allowed(pfn)) {
631 printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", 631 printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
632 current->comm, from, to - 1); 632 current->comm, from, to - 1);
633 return 0; 633 return 0;
634 } 634 }
635 cursor += PAGE_SIZE; 635 cursor += PAGE_SIZE;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7b22adaad4f1..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/tlb.h> 5#include <asm/tlb.h>
6#include <asm/fixmap.h> 6#include <asm/fixmap.h>
7#include <asm/mtrr.h>
7 8
8#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 9#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
9 10
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
58 tlb_remove_page(tlb, pte); 59 tlb_remove_page(tlb, pte);
59} 60}
60 61
61#if PAGETABLE_LEVELS > 2 62#if CONFIG_PGTABLE_LEVELS > 2
62void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 63void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
63{ 64{
64 struct page *page = virt_to_page(pmd); 65 struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
74 tlb_remove_page(tlb, page); 75 tlb_remove_page(tlb, page);
75} 76}
76 77
77#if PAGETABLE_LEVELS > 3 78#if CONFIG_PGTABLE_LEVELS > 3
78void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 79void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
79{ 80{
80 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
81 tlb_remove_page(tlb, virt_to_page(pud)); 82 tlb_remove_page(tlb, virt_to_page(pud));
82} 83}
83#endif /* PAGETABLE_LEVELS > 3 */ 84#endif /* CONFIG_PGTABLE_LEVELS > 3 */
84#endif /* PAGETABLE_LEVELS > 2 */ 85#endif /* CONFIG_PGTABLE_LEVELS > 2 */
85 86
86static inline void pgd_list_add(pgd_t *pgd) 87static inline void pgd_list_add(pgd_t *pgd)
87{ 88{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
117 /* If the pgd points to a shared pagetable level (either the 118 /* If the pgd points to a shared pagetable level (either the
118 ptes in non-PAE, or shared PMD in PAE), then just copy the 119 ptes in non-PAE, or shared PMD in PAE), then just copy the
119 references from swapper_pg_dir. */ 120 references from swapper_pg_dir. */
120 if (PAGETABLE_LEVELS == 2 || 121 if (CONFIG_PGTABLE_LEVELS == 2 ||
121 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 122 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
122 PAGETABLE_LEVELS == 4) { 123 CONFIG_PGTABLE_LEVELS == 4) {
123 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 124 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
124 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 125 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
125 KERNEL_PGD_PTRS); 126 KERNEL_PGD_PTRS);
@@ -275,12 +276,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
275 } 276 }
276} 277}
277 278
279/*
280 * Xen paravirt assumes that the pgd table fits in one page, and the
281 * 64-bit kernel makes the same assumption.
282 *
283 * But a kernel using PAE paging that is not running as a Xen domain
284 * only needs to allocate 32 bytes for the pgd instead of one page.
285 */
286#ifdef CONFIG_X86_PAE
287
288#include <linux/slab.h>
289
290#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
291#define PGD_ALIGN 32
292
293static struct kmem_cache *pgd_cache;
294
295static int __init pgd_cache_init(void)
296{
297 /*
298 * When a PAE kernel is running as a Xen domain, it does not use a
299 * shared kernel pmd, and that requires a whole page for the pgd.
300 */
301 if (!SHARED_KERNEL_PMD)
302 return 0;
303
304 /*
305 * When a PAE kernel is not running as a Xen domain, it uses a shared
306 * kernel pmd, which does not require a whole page for the pgd; 32
307 * bytes are enough. At boot time we create a 32-byte slab cache for
308 * pgd table allocations.
309 */
310 pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
311 SLAB_PANIC, NULL);
312 if (!pgd_cache)
313 return -ENOMEM;
314
315 return 0;
316}
317core_initcall(pgd_cache_init);
318
319static inline pgd_t *_pgd_alloc(void)
320{
321 /*
322 * Without SHARED_KERNEL_PMD, the PAE kernel is running as a Xen
323 * domain, so allocate one whole page for the pgd.
324 */
325 if (!SHARED_KERNEL_PMD)
326 return (pgd_t *)__get_free_page(PGALLOC_GFP);
327
328 /*
329 * Otherwise the PAE kernel is not running as a Xen domain, so a
330 * 32-byte slab object is enough for the pgd and saves memory.
331 */
332 return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
333}
334
335static inline void _pgd_free(pgd_t *pgd)
336{
337 if (!SHARED_KERNEL_PMD)
338 free_page((unsigned long)pgd);
339 else
340 kmem_cache_free(pgd_cache, pgd);
341}
342#else
343static inline pgd_t *_pgd_alloc(void)
344{
345 return (pgd_t *)__get_free_page(PGALLOC_GFP);
346}
347
348static inline void _pgd_free(pgd_t *pgd)
349{
350 free_page((unsigned long)pgd);
351}
352#endif /* CONFIG_X86_PAE */
353
278pgd_t *pgd_alloc(struct mm_struct *mm) 354pgd_t *pgd_alloc(struct mm_struct *mm)
279{ 355{
280 pgd_t *pgd; 356 pgd_t *pgd;
281 pmd_t *pmds[PREALLOCATED_PMDS]; 357 pmd_t *pmds[PREALLOCATED_PMDS];
282 358
283 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 359 pgd = _pgd_alloc();
284 360
285 if (pgd == NULL) 361 if (pgd == NULL)
286 goto out; 362 goto out;
@@ -310,7 +386,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
310out_free_pmds: 386out_free_pmds:
311 free_pmds(mm, pmds); 387 free_pmds(mm, pmds);
312out_free_pgd: 388out_free_pgd:
313 free_page((unsigned long)pgd); 389 _pgd_free(pgd);
314out: 390out:
315 return NULL; 391 return NULL;
316} 392}
@@ -320,7 +396,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
320 pgd_mop_up_pmds(mm, pgd); 396 pgd_mop_up_pmds(mm, pgd);
321 pgd_dtor(pgd); 397 pgd_dtor(pgd);
322 paravirt_pgd_free(mm, pgd); 398 paravirt_pgd_free(mm, pgd);
323 free_page((unsigned long)pgd); 399 _pgd_free(pgd);
324} 400}
325 401
326/* 402/*
@@ -485,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
485{ 561{
486 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 562 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
487} 563}
564
565#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
566int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
567{
568 u8 mtrr;
569
570 /*
571 * Do not use a huge page when the range is covered by non-WB type
572 * of MTRRs.
573 */
574 mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
575 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
576 return 0;
577
578 prot = pgprot_4k_2_large(prot);
579
580 set_pte((pte_t *)pud, pfn_pte(
581 (u64)addr >> PAGE_SHIFT,
582 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
583
584 return 1;
585}
586
587int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
588{
589 u8 mtrr;
590
591 /*
592 * Do not use a huge page when the range is covered by non-WB type
593 * of MTRRs.
594 */
595 mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
596 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
597 return 0;
598
599 prot = pgprot_4k_2_large(prot);
600
601 set_pte((pte_t *)pmd, pfn_pte(
602 (u64)addr >> PAGE_SHIFT,
603 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
604
605 return 1;
606}
607
608int pud_clear_huge(pud_t *pud)
609{
610 if (pud_large(*pud)) {
611 pud_clear(pud);
612 return 1;
613 }
614
615 return 0;
616}
617
618int pmd_clear_huge(pmd_t *pmd)
619{
620 if (pmd_large(*pmd)) {
621 pmd_clear(pmd);
622 return 1;
623 }
624
625 return 0;
626}
627#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
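The new _pgd_alloc()/_pgd_free() pair above picks between a whole page and a 32-byte slab object depending on SHARED_KERNEL_PMD. A userspace sketch of that policy, with calloc()/free() standing in for the page allocator and the kmem cache:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
#define PGD_SIZE  32UL	/* PTRS_PER_PGD * sizeof(pgd_t) on a PAE kernel */

/* One page when the kernel pmd is not shared (Xen), 32 bytes otherwise. */
static void *pgd_alloc_sketch(int shared_kernel_pmd)
{
	return calloc(1, shared_kernel_pmd ? PGD_SIZE : PAGE_SIZE);
}

int main(void)
{
	void *pgd = pgd_alloc_sketch(1);

	printf("allocated a %lu-byte pgd at %p\n", PGD_SIZE, pgd);
	free(pgd);
	return 0;
}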
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5efb64..4e664bdb535a 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
111{ 111{
112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); 112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
113 113
114 if (!user_mode_vm(regs)) { 114 if (!user_mode(regs)) {
115 unsigned long stack = kernel_stack_pointer(regs); 115 unsigned long stack = kernel_stack_pointer(regs);
116 if (depth) 116 if (depth)
117 dump_trace(NULL, regs, (unsigned long *)stack, 0, 117 dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2fb384724ebb..8fd6f44aee83 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
490 if (!bus) { 490 if (!bus) {
491 pci_free_resource_list(&resources); 491 pci_free_resource_list(&resources);
492 kfree(sd); 492 kfree(sd);
493 return;
493 } 494 }
495 pci_bus_add_devices(bus);
494} 496}
495 497
496void __init pcibios_set_cache_line_size(void) 498void __init pcibios_set_cache_line_size(void)
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d143d216d52b..d7f997f7c26d 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
67 67
68 image = efi_lookup_mapped_addr(bgrt_tab->image_address); 68 image = efi_lookup_mapped_addr(bgrt_tab->image_address);
69 if (!image) { 69 if (!image) {
70 image = early_memremap(bgrt_tab->image_address, 70 image = early_ioremap(bgrt_tab->image_address,
71 sizeof(bmp_header)); 71 sizeof(bmp_header));
72 ioremapped = true; 72 ioremapped = true;
73 if (!image) { 73 if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
89 } 89 }
90 90
91 if (ioremapped) { 91 if (ioremapped) {
92 image = early_memremap(bgrt_tab->image_address, 92 image = early_ioremap(bgrt_tab->image_address,
93 bmp_header.size); 93 bmp_header.size);
94 if (!image) { 94 if (!image) {
95 pr_err("Ignoring BGRT: failed to map image memory\n"); 95 pr_err("Ignoring BGRT: failed to map image memory\n");
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index dbc8627a5cdf..02744df576d5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
85 efi_memory_desc_t *virtual_map) 85 efi_memory_desc_t *virtual_map)
86{ 86{
87 efi_status_t status; 87 efi_status_t status;
88 unsigned long flags;
89 pgd_t *save_pgd;
88 90
89 efi_call_phys_prolog(); 91 save_pgd = efi_call_phys_prolog();
92
93 /* Disable interrupts around EFI calls: */
94 local_irq_save(flags);
90 status = efi_call_phys(efi_phys.set_virtual_address_map, 95 status = efi_call_phys(efi_phys.set_virtual_address_map,
91 memory_map_size, descriptor_size, 96 memory_map_size, descriptor_size,
92 descriptor_version, virtual_map); 97 descriptor_version, virtual_map);
93 efi_call_phys_epilog(); 98 local_irq_restore(flags);
99
100 efi_call_phys_epilog(save_pgd);
101
94 return status; 102 return status;
95} 103}
96 104
@@ -491,7 +499,8 @@ void __init efi_init(void)
491 if (efi_memmap_init()) 499 if (efi_memmap_init())
492 return; 500 return;
493 501
494 print_efi_memmap(); 502 if (efi_enabled(EFI_DBG))
503 print_efi_memmap();
495} 504}
496 505
497void __init efi_late_init(void) 506void __init efi_late_init(void)
@@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str)
939{ 948{
940 if (parse_option_str(str, "old_map")) 949 if (parse_option_str(str, "old_map"))
941 set_bit(EFI_OLD_MEMMAP, &efi.flags); 950 set_bit(EFI_OLD_MEMMAP, &efi.flags);
951 if (parse_option_str(str, "debug"))
952 set_bit(EFI_DBG, &efi.flags);
942 953
943 return 0; 954 return 0;
944} 955}
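phys_efi_set_virtual_address_map() above now follows a save/call/restore pattern: the prolog returns the page-table state to restore, interrupts are disabled only around the firmware call, and the epilog takes the saved state back. A minimal sketch of that calling discipline; prolog(), epilog() and firmware_call() are stand-ins, not kernel APIs:

#include <stdio.h>

typedef struct { int dummy; } pgd_t;	/* stand-in type */
static pgd_t boot_pgd;

static pgd_t *prolog(void)		/* save current state, switch mappings */
{
	return &boot_pgd;
}

static void epilog(pgd_t *saved)	/* restore whatever prolog() returned */
{
	(void)saved;
}

static void firmware_call(void)
{
	puts("EFI runtime call");
}

int main(void)
{
	pgd_t *saved = prolog();

	/* local_irq_save() would bracket the call in the kernel */
	firmware_call();
	/* local_irq_restore() here */

	epilog(saved);
	return 0;
}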
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e7cda52936..ed5b67338294 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -33,11 +33,10 @@
33 33
34/* 34/*
35 * To make EFI call EFI runtime service in physical addressing mode we need 35 * To make EFI call EFI runtime service in physical addressing mode we need
36 * prolog/epilog before/after the invocation to disable interrupt, to 36 * prolog/epilog before/after the invocation to claim the EFI runtime service
37 * claim EFI runtime service handler exclusively and to duplicate a memory in 37 * handler exclusively and to duplicate a memory mapping in low memory space,
38 * low memory space say 0 - 3G. 38 * say 0 - 3G.
39 */ 39 */
40static unsigned long efi_rt_eflags;
41 40
42void efi_sync_low_kernel_mappings(void) {} 41void efi_sync_low_kernel_mappings(void) {}
43void __init efi_dump_pagetable(void) {} 42void __init efi_dump_pagetable(void) {}
@@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md)
57void __init efi_map_region_fixed(efi_memory_desc_t *md) {} 56void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
58void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} 57void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
59 58
60void __init efi_call_phys_prolog(void) 59pgd_t * __init efi_call_phys_prolog(void)
61{ 60{
62 struct desc_ptr gdt_descr; 61 struct desc_ptr gdt_descr;
62 pgd_t *save_pgd;
63 63
64 local_irq_save(efi_rt_eflags); 64 /* Current pgd is swapper_pg_dir, we'll restore it later: */
65 65 save_pgd = swapper_pg_dir;
66 load_cr3(initial_page_table); 66 load_cr3(initial_page_table);
67 __flush_tlb_all(); 67 __flush_tlb_all();
68 68
69 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 69 gdt_descr.address = __pa(get_cpu_gdt_table(0));
70 gdt_descr.size = GDT_SIZE - 1; 70 gdt_descr.size = GDT_SIZE - 1;
71 load_gdt(&gdt_descr); 71 load_gdt(&gdt_descr);
72
73 return save_pgd;
72} 74}
73 75
74void __init efi_call_phys_epilog(void) 76void __init efi_call_phys_epilog(pgd_t *save_pgd)
75{ 77{
76 struct desc_ptr gdt_descr; 78 struct desc_ptr gdt_descr;
77 79
@@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void)
79 gdt_descr.size = GDT_SIZE - 1; 81 gdt_descr.size = GDT_SIZE - 1;
80 load_gdt(&gdt_descr); 82 load_gdt(&gdt_descr);
81 83
82 load_cr3(swapper_pg_dir); 84 load_cr3(save_pgd);
83 __flush_tlb_all(); 85 __flush_tlb_all();
84
85 local_irq_restore(efi_rt_eflags);
86} 86}
87 87
88void __init efi_runtime_mkexec(void) 88void __init efi_runtime_mkexec(void)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 17e80d829df0..a0ac0f9c307f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,9 +41,6 @@
41#include <asm/realmode.h> 41#include <asm/realmode.h>
42#include <asm/time.h> 42#include <asm/time.h>
43 43
44static pgd_t *save_pgd __initdata;
45static unsigned long efi_flags __initdata;
46
47/* 44/*
48 * We allocate runtime services regions bottom-up, starting from -4G, i.e. 45 * We allocate runtime services regions bottom-up, starting from -4G, i.e.
49 * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. 46 * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
@@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable)
78 } 75 }
79} 76}
80 77
81void __init efi_call_phys_prolog(void) 78pgd_t * __init efi_call_phys_prolog(void)
82{ 79{
83 unsigned long vaddress; 80 unsigned long vaddress;
81 pgd_t *save_pgd;
82
84 int pgd; 83 int pgd;
85 int n_pgds; 84 int n_pgds;
86 85
87 if (!efi_enabled(EFI_OLD_MEMMAP)) 86 if (!efi_enabled(EFI_OLD_MEMMAP))
88 return; 87 return NULL;
89 88
90 early_code_mapping_set_exec(1); 89 early_code_mapping_set_exec(1);
91 local_irq_save(efi_flags);
92 90
93 n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); 91 n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
94 save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); 92 save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
@@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void)
99 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); 97 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
100 } 98 }
101 __flush_tlb_all(); 99 __flush_tlb_all();
100
101 return save_pgd;
102} 102}
103 103
104void __init efi_call_phys_epilog(void) 104void __init efi_call_phys_epilog(pgd_t *save_pgd)
105{ 105{
106 /* 106 /*
107 * After the lock is released, the original page table is restored. 107 * After the lock is released, the original page table is restored.
108 */ 108 */
109 int pgd; 109 int pgd_idx;
110 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); 110 int nr_pgds;
111 111
112 if (!efi_enabled(EFI_OLD_MEMMAP)) 112 if (!save_pgd)
113 return; 113 return;
114 114
115 for (pgd = 0; pgd < n_pgds; pgd++) 115 nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
116 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); 116
117 for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
118 set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
119
117 kfree(save_pgd); 120 kfree(save_pgd);
121
118 __flush_tlb_all(); 122 __flush_tlb_all();
119 local_irq_restore(efi_flags);
120 early_code_mapping_set_exec(0); 123 early_code_mapping_set_exec(0);
121} 124}
122 125
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index c9a0838890e2..278e4da4222f 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <asm-generic/sections.h> 13#include <asm-generic/sections.h>
14#include <asm/cpu_device_id.h>
14#include <asm/imr.h> 15#include <asm/imr.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
@@ -101,6 +102,12 @@ static void __init imr_self_test(void)
101 } 102 }
102} 103}
103 104
105static const struct x86_cpu_id imr_ids[] __initconst = {
106 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
107 {}
108};
109MODULE_DEVICE_TABLE(x86cpu, imr_ids);
110
104/** 111/**
105 * imr_self_test_init - entry point for IMR driver. 112 * imr_self_test_init - entry point for IMR driver.
106 * 113 *
@@ -108,7 +115,8 @@ static void __init imr_self_test(void)
108 */ 115 */
109static int __init imr_self_test_init(void) 116static int __init imr_self_test_init(void)
110{ 117{
111 imr_self_test(); 118 if (x86_match_cpu(imr_ids))
119 imr_self_test();
112 return 0; 120 return 0;
113} 121}
114 122
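The IMR self-test above now runs only when x86_match_cpu() finds the Quark X1000 in a small CPU-ID table. A simplified, self-contained sketch of that gating pattern; the struct and matcher below are stand-ins for x86_cpu_id and x86_match_cpu():

#include <stdio.h>

struct cpu_id { int vendor, family, model; };

static const struct cpu_id imr_ids[] = {
	{ 0 /* Intel */, 5, 9 },	/* Quark SoC X1000 */
	{ -1, -1, -1 },			/* table terminator */
};

static int match_cpu(const struct cpu_id *tbl, struct cpu_id cur)
{
	for (; tbl->vendor != -1; tbl++)
		if (tbl->vendor == cur.vendor &&
		    tbl->family == cur.family &&
		    tbl->model == cur.model)
			return 1;
	return 0;
}

int main(void)
{
	struct cpu_id cur = { 0, 5, 9 };

	if (match_cpu(imr_ids, cur))
		puts("running IMR self-test");
	return 0;
}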
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
61 61
62 if (psy) { 62 if (psy) {
63 power_supply_changed(psy); 63 power_supply_changed(psy);
64 put_device(psy->dev); 64 power_supply_put(psy);
65 } 65 }
66} 66}
67 67
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
71 71
72 if (psy) { 72 if (psy) {
73 power_supply_changed(psy); 73 power_supply_changed(psy);
74 put_device(psy->dev); 74 power_supply_put(psy);
75 } 75 }
76} 76}
77 77
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
83 83
84 if (psy) { 84 if (psy) {
85 power_supply_changed(psy); 85 power_supply_changed(psy);
86 put_device(psy->dev); 86 power_supply_put(psy);
87 } 87 }
88} 88}
89 89
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
93 93
94 if (psy) { 94 if (psy) {
95 power_supply_changed(psy); 95 power_supply_changed(psy);
96 put_device(psy->dev); 96 power_supply_put(psy);
97 } 97 }
98} 98}
99 99
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 994798548b1a..3b6ec42718e4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
415 struct reset_args reset_args; 415 struct reset_args reset_args;
416 416
417 reset_args.sender = sender; 417 reset_args.sender = sender;
418 cpus_clear(*mask); 418 cpumask_clear(mask);
419 /* find a single cpu for each uvhub in this distribution mask */ 419 /* find a single cpu for each uvhub in this distribution mask */
420 maskbits = sizeof(struct pnmask) * BITSPERBYTE; 420 maskbits = sizeof(struct pnmask) * BITSPERBYTE;
421 /* each bit is a pnode relative to the partition base pnode */ 421 /* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
425 continue; 425 continue;
426 apnode = pnode + bcp->partition_base_pnode; 426 apnode = pnode + bcp->partition_base_pnode;
427 cpu = pnode_to_first_cpu(apnode, smaster); 427 cpu = pnode_to_first_cpu(apnode, smaster);
428 cpu_set(cpu, *mask); 428 cpumask_set_cpu(cpu, mask);
429 } 429 }
430 430
431 /* IPI all cpus; preemption is already disabled */ 431 /* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1126 /* don't actually do a shootdown of the local cpu */ 1126 /* don't actually do a shootdown of the local cpu */
1127 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 1127 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
1128 1128
1129 if (cpu_isset(cpu, *cpumask)) 1129 if (cpumask_test_cpu(cpu, cpumask))
1130 stat->s_ntargself++; 1130 stat->s_ntargself++;
1131 1131
1132 bau_desc = bcp->descriptor_base; 1132 bau_desc = bcp->descriptor_base;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
134static void fix_processor_context(void) 134static void fix_processor_context(void)
135{ 135{
136 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
137 struct tss_struct *t = &per_cpu(init_tss, cpu); 137 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
138#ifdef CONFIG_X86_64 138#ifdef CONFIG_X86_64
139 struct desc_struct *desc = get_cpu_gdt_table(cpu); 139 struct desc_struct *desc = get_cpu_gdt_table(cpu);
140 tss_desc tss; 140 tss_desc tss;
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 3323c2745248..a55abb9f6c5e 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR $@
19quiet_cmd_systbl = SYSTBL $@ 19quiet_cmd_systbl = SYSTBL $@
20 cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ 20 cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
21 21
22quiet_cmd_hypercalls = HYPERCALLS $@
23 cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
24
22syshdr_abi_unistd_32 := i386 25syshdr_abi_unistd_32 := i386
23$(uapi)/unistd_32.h: $(syscall32) $(syshdr) 26$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
24 $(call if_changed,syshdr) 27 $(call if_changed,syshdr)
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
47$(out)/syscalls_64.h: $(syscall64) $(systbl) 50$(out)/syscalls_64.h: $(syscall64) $(systbl)
48 $(call if_changed,systbl) 51 $(call if_changed,systbl)
49 52
53$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
54 $(call if_changed,hypercalls)
55
56$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
57
50uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h 58uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
51syshdr-y += syscalls_32.h 59syshdr-y += syscalls_32.h
52syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h 60syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
53syshdr-$(CONFIG_X86_64) += syscalls_64.h 61syshdr-$(CONFIG_X86_64) += syscalls_64.h
62syshdr-$(CONFIG_XEN) += xen-hypercalls.h
54 63
55targets += $(uapisyshdr-y) $(syshdr-y) 64targets += $(uapisyshdr-y) $(syshdr-y)
56 65
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
119110 i386 iopl sys_iopl 119110 i386 iopl sys_iopl
120111 i386 vhangup sys_vhangup 120111 i386 vhangup sys_vhangup
121112 i386 idle 121112 i386 idle
122113 i386 vm86old sys_vm86old sys32_vm86_warning 122113 i386 vm86old sys_vm86old sys_ni_syscall
123114 i386 wait4 sys_wait4 compat_sys_wait4 123114 i386 wait4 sys_wait4 compat_sys_wait4
124115 i386 swapoff sys_swapoff 124115 i386 swapoff sys_swapoff
125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
@@ -172,7 +172,7 @@
172163 i386 mremap sys_mremap 172163 i386 mremap sys_mremap
173164 i386 setresuid sys_setresuid16 173164 i386 setresuid sys_setresuid16
174165 i386 getresuid sys_getresuid16 174165 i386 getresuid sys_getresuid16
175166 i386 vm86 sys_vm86 sys32_vm86_warning 175166 i386 vm86 sys_vm86 sys_ni_syscall
176167 i386 query_module 176167 i386 query_module
177168 i386 poll sys_poll 177168 i386 poll sys_poll
178169 i386 nfsservctl 178169 i386 nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl stub_iopl 181172 common iopl sys_iopl
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index eafa324eb7a5..acb384d24669 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o
21 21
22subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o 22subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
23subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o 23subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
24subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
25 24
26else 25else
27 26
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..7e8a1a650435 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,22 +36,11 @@
36#endif /* CONFIG_X86_PPRO_FENCE */ 36#endif /* CONFIG_X86_PPRO_FENCE */
37#define dma_wmb() barrier() 37#define dma_wmb() barrier()
38 38
39#ifdef CONFIG_SMP
40
41#define smp_mb() mb()
42#define smp_rmb() dma_rmb()
43#define smp_wmb() barrier()
44#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
45
46#else /* CONFIG_SMP */
47
48#define smp_mb() barrier() 39#define smp_mb() barrier()
49#define smp_rmb() barrier() 40#define smp_rmb() barrier()
50#define smp_wmb() barrier() 41#define smp_wmb() barrier()
51#define set_mb(var, value) do { var = value; barrier(); } while (0) 42#define set_mb(var, value) do { var = value; barrier(); } while (0)
52 43
53#endif /* CONFIG_SMP */
54
55#define read_barrier_depends() do { } while (0) 44#define read_barrier_depends() do { } while (0)
56#define smp_read_barrier_depends() do { } while (0) 45#define smp_read_barrier_depends() do { } while (0)
57 46
@@ -64,8 +53,8 @@
64 */ 53 */
65static inline void rdtsc_barrier(void) 54static inline void rdtsc_barrier(void)
66{ 55{
67 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 56 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
68 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 57 "lfence", X86_FEATURE_LFENCE_RDTSC);
69} 58}
70 59
71#endif 60#endif
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 25a1022dd793..0a656b727b1a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
210 210
211#define ELF_EXEC_PAGESIZE 4096 211#define ELF_EXEC_PAGESIZE 4096
212 212
213#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) 213#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
214 214
215extern long elf_aux_hwcap; 215extern long elf_aux_hwcap;
216#define ELF_HWCAP (elf_aux_hwcap) 216#define ELF_HWCAP (elf_aux_hwcap)
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 8e08176f0bcb..5c0b711d2433 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -8,9 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <asm/unistd.h> 9#include <asm/unistd.h>
10#include <os.h> 10#include <os.h>
11#include <proc_mm.h>
12#include <skas.h> 11#include <skas.h>
13#include <skas_ptrace.h>
14#include <sysdep/tls.h> 12#include <sysdep/tls.h>
15 13
16extern int modify_ldt(int func, void *ptr, unsigned long bytecount); 14extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
19 struct user_desc *desc, void **addr, int done) 17 struct user_desc *desc, void **addr, int done)
20{ 18{
21 long res; 19 long res;
22 20 void *stub_addr;
23 if (proc_mm) { 21 res = syscall_stub_data(mm_idp, (unsigned long *)desc,
24 /* 22 (sizeof(*desc) + sizeof(long) - 1) &
25 * This is a special handling for the case, that the mm to 23 ~(sizeof(long) - 1),
26 * modify isn't current->active_mm. 24 addr, &stub_addr);
27 * If this is called directly by modify_ldt, 25 if (!res) {
28 * (current->active_mm->context.skas.u == mm_idp) 26 unsigned long args[] = { func,
29 * will be true. So no call to __switch_mm(mm_idp) is done. 27 (unsigned long)stub_addr,
30 * If this is called in case of init_new_ldt or PTRACE_LDT, 28 sizeof(*desc),
31 * mm_idp won't belong to current->active_mm, but child->mm. 29 0, 0, 0 };
32 * So we need to switch child's mm into our userspace, then 30 res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
33 * later switch back. 31 0, addr, done);
34 *
35 * Note: I'm unsure: should interrupts be disabled here?
36 */
37 if (!current->active_mm || current->active_mm == &init_mm ||
38 mm_idp != &current->active_mm->context.id)
39 __switch_mm(mm_idp);
40 }
41
42 if (ptrace_ldt) {
43 struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
44 .func = func,
45 .ptr = desc,
46 .bytecount = sizeof(*desc)};
47 u32 cpu;
48 int pid;
49
50 if (!proc_mm)
51 pid = mm_idp->u.pid;
52 else {
53 cpu = get_cpu();
54 pid = userspace_pid[cpu];
55 }
56
57 res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op);
58
59 if (proc_mm)
60 put_cpu();
61 }
62 else {
63 void *stub_addr;
64 res = syscall_stub_data(mm_idp, (unsigned long *)desc,
65 (sizeof(*desc) + sizeof(long) - 1) &
66 ~(sizeof(long) - 1),
67 addr, &stub_addr);
68 if (!res) {
69 unsigned long args[] = { func,
70 (unsigned long)stub_addr,
71 sizeof(*desc),
72 0, 0, 0 };
73 res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
74 0, addr, done);
75 }
76 } 32 }
77 33
78 if (proc_mm) {
79 /*
80 * This is the second part of special handling, that makes
81 * PTRACE_LDT possible to implement.
82 */
83 if (current->active_mm && current->active_mm != &init_mm &&
84 mm_idp != &current->active_mm->context.id)
85 __switch_mm(&current->active_mm->context.id);
86 }
87
88 return res;
89}
90
91static long read_ldt_from_host(void __user * ptr, unsigned long bytecount)
92{
93 int res, n;
94 struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) {
95 .func = 0,
96 .bytecount = bytecount,
97 .ptr = kmalloc(bytecount, GFP_KERNEL)};
98 u32 cpu;
99
100 if (ptrace_ldt.ptr == NULL)
101 return -ENOMEM;
102
103 /*
104 * This is called from sys_modify_ldt only, so userspace_pid gives
105 * us the right number
106 */
107
108 cpu = get_cpu();
109 res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt);
110 put_cpu();
111 if (res < 0)
112 goto out;
113
114 n = copy_to_user(ptr, ptrace_ldt.ptr, res);
115 if (n != 0)
116 res = -EFAULT;
117
118 out:
119 kfree(ptrace_ldt.ptr);
120
121 return res; 34 return res;
122} 35}
123 36
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
145 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; 58 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
146 err = bytecount; 59 err = bytecount;
147 60
148 if (ptrace_ldt)
149 return read_ldt_from_host(ptr, bytecount);
150
151 mutex_lock(&ldt->lock); 61 mutex_lock(&ldt->lock);
152 if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { 62 if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
153 size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; 63 size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
229 goto out; 139 goto out;
230 } 140 }
231 141
232 if (!ptrace_ldt) 142 mutex_lock(&ldt->lock);
233 mutex_lock(&ldt->lock);
234 143
235 err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); 144 err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
236 if (err) 145 if (err)
237 goto out_unlock; 146 goto out_unlock;
238 else if (ptrace_ldt) {
239 /* With PTRACE_LDT available, this is used as a flag only */
240 ldt->entry_count = 1;
241 goto out;
242 }
243 147
244 if (ldt_info.entry_number >= ldt->entry_count && 148 if (ldt_info.entry_number >= ldt->entry_count &&
245 ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { 149 ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
393 int i; 297 int i;
394 long page, err=0; 298 long page, err=0;
395 void *addr = NULL; 299 void *addr = NULL;
396 struct proc_mm_op copy;
397 300
398 301
399 if (!ptrace_ldt) 302 mutex_init(&new_mm->arch.ldt.lock);
400 mutex_init(&new_mm->arch.ldt.lock);
401 303
402 if (!from_mm) { 304 if (!from_mm) {
403 memset(&desc, 0, sizeof(desc)); 305 memset(&desc, 0, sizeof(desc));
404 /* 306 /*
405 * We have to initialize a clean ldt. 307 * Now we try to retrieve info about the ldt, we
308 * inherited from the host. All ldt-entries found
309 * will be reset in the following loop
406 */ 310 */
407 if (proc_mm) { 311 ldt_get_host_info();
408 /* 312 for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
409 * If the new mm was created using proc_mm, host's 313 desc.entry_number = *num_p;
410 * default-ldt currently is assigned, which normally 314 err = write_ldt_entry(&new_mm->id, 1, &desc,
411 * contains the call-gates for lcall7 and lcall27. 315 &addr, *(num_p + 1) == -1);
412 * To remove these gates, we simply write an empty 316 if (err)
413 * entry as number 0 to the host. 317 break;
414 */
415 err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1);
416 }
417 else{
418 /*
419 * Now we try to retrieve info about the ldt, we
420 * inherited from the host. All ldt-entries found
421 * will be reset in the following loop
422 */
423 ldt_get_host_info();
424 for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
425 desc.entry_number = *num_p;
426 err = write_ldt_entry(&new_mm->id, 1, &desc,
427 &addr, *(num_p + 1) == -1);
428 if (err)
429 break;
430 }
431 } 318 }
432 new_mm->arch.ldt.entry_count = 0; 319 new_mm->arch.ldt.entry_count = 0;
433 320
434 goto out; 321 goto out;
435 } 322 }
436 323
437 if (proc_mm) { 324 /*
438 /* 325 * Our local LDT is used to supply the data for
439 * We have a valid from_mm, so we now have to copy the LDT of 326 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
440 * from_mm to new_mm, because using proc_mm an new mm with 327 * i.e., we have to use the stub for modify_ldt, which
441 * an empty/default LDT was created in new_mm() 328 * can't handle the big read buffer of up to 64kB.
442 */ 329 */
443 copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, 330 mutex_lock(&from_mm->arch.ldt.lock);
444 .u = 331 if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
445 { .copy_segments = 332 memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
446 from_mm->id.u.mm_fd } } ); 333 sizeof(new_mm->arch.ldt.u.entries));
447 i = os_write_file(new_mm->id.u.mm_fd, &copy, sizeof(copy)); 334 else {
448 if (i != sizeof(copy)) 335 i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
449 printk(KERN_ERR "new_mm : /proc/mm copy_segments " 336 while (i-->0) {
450 "failed, err = %d\n", -i); 337 page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
451 } 338 if (!page) {
452 339 err = -ENOMEM;
453 if (!ptrace_ldt) { 340 break;
454 /*
455 * Our local LDT is used to supply the data for
456 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
457 * i.e., we have to use the stub for modify_ldt, which
458 * can't handle the big read buffer of up to 64kB.
459 */
460 mutex_lock(&from_mm->arch.ldt.lock);
461 if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
462 memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
463 sizeof(new_mm->arch.ldt.u.entries));
464 else {
465 i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
466 while (i-->0) {
467 page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
468 if (!page) {
469 err = -ENOMEM;
470 break;
471 }
472 new_mm->arch.ldt.u.pages[i] =
473 (struct ldt_entry *) page;
474 memcpy(new_mm->arch.ldt.u.pages[i],
475 from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
476 } 341 }
342 new_mm->arch.ldt.u.pages[i] =
343 (struct ldt_entry *) page;
344 memcpy(new_mm->arch.ldt.u.pages[i],
345 from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
477 } 346 }
478 new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
479 mutex_unlock(&from_mm->arch.ldt.lock);
480 } 347 }
348 new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
349 mutex_unlock(&from_mm->arch.ldt.lock);
481 350
482 out: 351 out:
483 return err; 352 return err;
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm)
488{ 357{
489 int i; 358 int i;
490 359
491 if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { 360 if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
492 i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; 361 i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
493 while (i-- > 0) 362 while (i-- > 0)
494 free_page((long) mm->arch.ldt.u.pages[i]); 363 free_page((long) mm->arch.ldt.u.pages[i]);
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index a26086b8a800..b6f2437ec29c 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -27,9 +27,6 @@ struct faultinfo {
27/* This is Page Fault */ 27/* This is Page Fault */
28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) 28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
29 29
30/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
31#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo)
32
33#define PTRACE_FULL_FAULTINFO 0 30#define PTRACE_FULL_FAULTINFO 0
34 31
35#endif 32#endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index f811cbe15d62..ee88f88974ea 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -27,9 +27,6 @@ struct faultinfo {
27/* This is Page Fault */ 27/* This is Page Fault */
28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) 28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
29 29
30/* No broken SKAS API, which doesn't pass trap_no, here. */
31#define SEGV_MAYBE_FIXABLE(fi) 0
32
33#define PTRACE_FULL_FAULTINFO 1 30#define PTRACE_FULL_FAULTINFO 1
34 31
35#endif 32#endif
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h
deleted file mode 100644
index 453febe98993..000000000000
--- a/arch/x86/um/shared/sysdep/skas_ptrace.h
+++ /dev/null
@@ -1,22 +0,0 @@
1/*
2 * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
3 * Licensed under the GPL
4 */
5
6#ifndef __SYSDEP_X86_SKAS_PTRACE_H
7#define __SYSDEP_X86_SKAS_PTRACE_H
8
9struct ptrace_faultinfo {
10 int is_write;
11 unsigned long addr;
12};
13
14struct ptrace_ldt {
15 int func;
16 void *ptr;
17 unsigned long bytecount;
18};
19
20#define PTRACE_LDT 54
21
22#endif
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 0c8c32bfd792..592491d1d70d 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
549 if (err) 549 if (err)
550 return err; 550 return err;
551 551
552 /* Set up registers for signal handler */
553 {
554 struct exec_domain *ed = current_thread_info()->exec_domain;
555 if (unlikely(ed && ed->signal_invmap && sig < 32))
556 sig = ed->signal_invmap[sig];
557 }
558
559 PT_REGS_SP(regs) = (unsigned long) frame; 552 PT_REGS_SP(regs) = (unsigned long) frame;
560 PT_REGS_DI(regs) = sig; 553 PT_REGS_DI(regs) = sig;
561 /* In case the signal handler was declared without prototypes */ 554 /* In case the signal handler was declared without prototypes */
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
16 */ 16 */
17 17
18/* Not going to be implemented by UML, since we have no hardware. */ 18/* Not going to be implemented by UML, since we have no hardware. */
19#define stub_iopl sys_ni_syscall 19#define sys_iopl sys_ni_syscall
20#define sys_ioperm sys_ni_syscall 20#define sys_ioperm sys_ni_syscall
21 21
22/* 22/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 7b9be9822724..275a3a8b78af 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
51$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE 51$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
52 $(call if_changed,vdso) 52 $(call if_changed,vdso)
53 53
54HOST_EXTRACFLAGS += -I$(srctree)/tools/include 54HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi
55hostprogs-y += vdso2c 55hostprogs-y += vdso2c
56 56
57quiet_cmd_vdso2c = VDSO2C $@ 57quiet_cmd_vdso2c = VDSO2C $@
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
206PHONY += vdso_install $(vdso_img_insttargets) 206PHONY += vdso_install $(vdso_img_insttargets)
207vdso_install: $(vdso_img_insttargets) FORCE 207vdso_install: $(vdso_img_insttargets) FORCE
208 208
209clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* 209clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322751e0..40d2473836c9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
82 cycle_t ret; 82 cycle_t ret;
83 u64 last; 83 u64 last;
84 u32 version; 84 u32 version;
85 u32 migrate_count;
85 u8 flags; 86 u8 flags;
86 unsigned cpu, cpu1; 87 unsigned cpu, cpu1;
87 88
88 89
89 /* 90 /*
90 * Note: hypervisor must guarantee that: 91 * When looping to get a consistent (time-info, tsc) pair, we
91 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 92 * also need to deal with the possibility we can switch vcpus,
92 * 2. that per-CPU pvclock time info is updated if the 93 * so make sure we always re-fetch time-info for the current vcpu.
93 * underlying CPU changes.
94 * 3. that version is increased whenever underlying CPU
95 * changes.
96 *
97 */ 94 */
98 do { 95 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK; 96 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode)
102 * __getcpu() calls (Gleb). 99 * __getcpu() calls (Gleb).
103 */ 100 */
104 101
105 pvti = get_pvti(cpu); 102 /* Make sure migrate_count will change if we leave the VCPU. */
103 do {
104 pvti = get_pvti(cpu);
105 migrate_count = pvti->migrate_count;
106
107 cpu1 = cpu;
108 cpu = __getcpu() & VGETCPU_CPU_MASK;
109 } while (unlikely(cpu != cpu1));
106 110
107 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 111 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
108 112
109 /* 113 /*
110 * Test we're still on the cpu as well as the version. 114 * Test we're still on the cpu as well as the version.
111 * We could have been migrated just after the first 115 * - We must read TSC of pvti's VCPU.
112 * vgetcpu but before fetching the version, so we 116 * - KVM doesn't follow the versioning protocol, so data could
113 * wouldn't notice a version change. 117 * change before version if we left the VCPU.
114 */ 118 */
115 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 119 smp_rmb();
116 } while (unlikely(cpu != cpu1 || 120 } while (unlikely((pvti->pvti.version & 1) ||
117 (pvti->pvti.version & 1) || 121 pvti->pvti.version != version ||
118 pvti->pvti.version != version)); 122 pvti->migrate_count != migrate_count));
119 123
120 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 124 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
121 *mode = VCLOCK_NONE; 125 *mode = VCLOCK_NONE;
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S
index 5415b5613d55..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -19,8 +19,6 @@ __kernel_vsyscall:
19.Lpush_ebp: 19.Lpush_ebp:
20 movl %ecx, %ebp 20 movl %ecx, %ebp
21 syscall 21 syscall
22 movl $__USER32_DS, %ecx
23 movl %ecx, %ss
24 movl %ebp, %ecx 22 movl %ebp, %ecx
25 popl %ebp 23 popl %ebp
26.Lpop_ebp: 24.Lpop_ebp:
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7005ced5d1ad..70e060ad879a 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
7#include <xen/xen.h> 7#include <xen/xen.h>
8#include <xen/interface/physdev.h> 8#include <xen/interface/physdev.h>
9#include "xen-ops.h" 9#include "xen-ops.h"
10#include "smp.h"
10 11
11static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) 12static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
12{ 13{
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
28 return 0xfd; 29 return 0xfd;
29} 30}
30 31
32static unsigned long xen_set_apic_id(unsigned int x)
33{
34 WARN_ON(1);
35 return x;
36}
37
38static unsigned int xen_get_apic_id(unsigned long x)
39{
40 return ((x)>>24) & 0xFFu;
41}
42
43static u32 xen_apic_read(u32 reg)
44{
45 struct xen_platform_op op = {
46 .cmd = XENPF_get_cpuinfo,
47 .interface_version = XENPF_INTERFACE_VERSION,
48 .u.pcpu_info.xen_cpuid = 0,
49 };
50 int ret = 0;
51
52 /* Shouldn't need this as APIC is turned off for PV, and we only
53 * get called on the bootup processor. But just in case. */
54 if (!xen_initial_domain() || smp_processor_id())
55 return 0;
56
57 if (reg == APIC_LVR)
58 return 0x10;
59#ifdef CONFIG_X86_32
60 if (reg == APIC_LDR)
61 return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
62#endif
63 if (reg != APIC_ID)
64 return 0;
65
66 ret = HYPERVISOR_dom0_op(&op);
67 if (ret)
68 return 0;
69
70 return op.u.pcpu_info.apic_id << 24;
71}
72
73static void xen_apic_write(u32 reg, u32 val)
74{
75 /* Warn to see if there's any stray references */
76 WARN(1,"register: %x, value: %x\n", reg, val);
77}
78
79static u64 xen_apic_icr_read(void)
80{
81 return 0;
82}
83
84static void xen_apic_icr_write(u32 low, u32 id)
85{
86 /* Warn to see if there's any stray references */
87 WARN_ON(1);
88}
89
90static u32 xen_safe_apic_wait_icr_idle(void)
91{
92 return 0;
93}
94
95static int xen_apic_probe_pv(void)
96{
97 if (xen_pv_domain())
98 return 1;
99
100 return 0;
101}
102
103static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
104{
105 return xen_pv_domain();
106}
107
108static int xen_id_always_valid(int apicid)
109{
110 return 1;
111}
112
113static int xen_id_always_registered(void)
114{
115 return 1;
116}
117
118static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
119{
120 return initial_apic_id >> index_msb;
121}
122
123#ifdef CONFIG_X86_32
124static int xen_x86_32_early_logical_apicid(int cpu)
125{
126 /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
127 return 1 << cpu;
128}
129#endif
130
131static void xen_noop(void)
132{
133}
134
135static void xen_silent_inquire(int apicid)
136{
137}
138
139static struct apic xen_pv_apic = {
140 .name = "Xen PV",
141 .probe = xen_apic_probe_pv,
142 .acpi_madt_oem_check = xen_madt_oem_check,
143 .apic_id_valid = xen_id_always_valid,
144 .apic_id_registered = xen_id_always_registered,
145
146 /* .irq_delivery_mode - used in native_compose_msi_msg only */
147 /* .irq_dest_mode - used in native_compose_msi_msg only */
148
149 .target_cpus = default_target_cpus,
150 .disable_esr = 0,
151	/* .dest_logical - default_send_IPI_* use it, but we use our own. */
152 .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
153
154 .vector_allocation_domain = flat_vector_allocation_domain,
155 .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
156
157 .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
158 .setup_apic_routing = NULL,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
161 .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
162 .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
163
164 .get_apic_id = xen_get_apic_id,
165 .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
166 .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
167
168 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
169
170#ifdef CONFIG_SMP
171 .send_IPI_mask = xen_send_IPI_mask,
172 .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
173 .send_IPI_allbutself = xen_send_IPI_allbutself,
174 .send_IPI_all = xen_send_IPI_all,
175 .send_IPI_self = xen_send_IPI_self,
176#endif
177	/* .wait_for_init_deassert - used by AP bootup (smp_callin), which we don't use */
178 .inquire_remote_apic = xen_silent_inquire,
179
180 .read = xen_apic_read,
181 .write = xen_apic_write,
182 .eoi_write = xen_apic_write,
183
184 .icr_read = xen_apic_icr_read,
185 .icr_write = xen_apic_icr_write,
186 .wait_icr_idle = xen_noop,
187 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
188
189#ifdef CONFIG_X86_32
190 /* generic_processor_info and setup_local_APIC. */
191 .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid,
192#endif
193};
194
195static void __init xen_apic_check(void)
196{
197 if (apic == &xen_pv_apic)
198 return;
199
200 pr_info("Switched APIC routing from %s to %s.\n", apic->name,
201 xen_pv_apic.name);
202 apic = &xen_pv_apic;
203}
31void __init xen_init_apic(void) 204void __init xen_init_apic(void)
32{ 205{
33 x86_io_apic_ops.read = xen_io_apic_read; 206 x86_io_apic_ops.read = xen_io_apic_read;
207 /* On PV guests the APIC CPUID bit is disabled so none of the
208 * routines end up executing. */
209 if (!xen_initial_domain())
210 apic = &xen_pv_apic;
211
212 x86_platform.apic_post_init = xen_apic_check;
34} 213}
214apic_driver(xen_pv_apic);
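
xen_init_apic() above installs xen_pv_apic as the default driver up front and then uses the apic_post_init hook (xen_apic_check) to re-assert it in case generic probing picked something else. A stripped-down sketch of that "set a default, re-check after probing" pattern follows; the struct, function and variable names are illustrative only, not the kernel's x86_platform machinery.

struct apic_driver {
	const char *name;
};

static struct apic_driver pv_driver = { .name = "Xen PV" };
static struct apic_driver *active_driver;	/* generic probing may change this */

static void post_init_check(void)
{
	if (active_driver == &pv_driver)
		return;
	/* Generic code chose another driver; force the PV one back in. */
	active_driver = &pv_driver;
}

static void init_drivers(void)
{
	active_driver = &pv_driver;		/* sensible default up front */
	/* ... generic probing runs here and may overwrite active_driver ... */
	post_init_check();
}
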
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..94578efd3067 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
912 mcs = xen_mc_entry(0); 912 mcs = xen_mc_entry(0);
913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
914 xen_mc_issue(PARAVIRT_LAZY_CPU); 914 xen_mc_issue(PARAVIRT_LAZY_CPU);
915 tss->x86_tss.sp0 = thread->sp0;
915} 916}
916 917
917static void xen_set_iopl_mask(unsigned mask) 918static void xen_set_iopl_mask(unsigned mask)
@@ -927,92 +928,6 @@ static void xen_io_delay(void)
927{ 928{
928} 929}
929 930
930#ifdef CONFIG_X86_LOCAL_APIC
931static unsigned long xen_set_apic_id(unsigned int x)
932{
933 WARN_ON(1);
934 return x;
935}
936static unsigned int xen_get_apic_id(unsigned long x)
937{
938 return ((x)>>24) & 0xFFu;
939}
940static u32 xen_apic_read(u32 reg)
941{
942 struct xen_platform_op op = {
943 .cmd = XENPF_get_cpuinfo,
944 .interface_version = XENPF_INTERFACE_VERSION,
945 .u.pcpu_info.xen_cpuid = 0,
946 };
947 int ret = 0;
948
949 /* Shouldn't need this as APIC is turned off for PV, and we only
950 * get called on the bootup processor. But just in case. */
951 if (!xen_initial_domain() || smp_processor_id())
952 return 0;
953
954 if (reg == APIC_LVR)
955 return 0x10;
956
957 if (reg != APIC_ID)
958 return 0;
959
960 ret = HYPERVISOR_dom0_op(&op);
961 if (ret)
962 return 0;
963
964 return op.u.pcpu_info.apic_id << 24;
965}
966
967static void xen_apic_write(u32 reg, u32 val)
968{
969 /* Warn to see if there's any stray references */
970 WARN_ON(1);
971}
972
973static u64 xen_apic_icr_read(void)
974{
975 return 0;
976}
977
978static void xen_apic_icr_write(u32 low, u32 id)
979{
980 /* Warn to see if there's any stray references */
981 WARN_ON(1);
982}
983
984static void xen_apic_wait_icr_idle(void)
985{
986 return;
987}
988
989static u32 xen_safe_apic_wait_icr_idle(void)
990{
991 return 0;
992}
993
994static void set_xen_basic_apic_ops(void)
995{
996 apic->read = xen_apic_read;
997 apic->write = xen_apic_write;
998 apic->icr_read = xen_apic_icr_read;
999 apic->icr_write = xen_apic_icr_write;
1000 apic->wait_icr_idle = xen_apic_wait_icr_idle;
1001 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
1002 apic->set_apic_id = xen_set_apic_id;
1003 apic->get_apic_id = xen_get_apic_id;
1004
1005#ifdef CONFIG_SMP
1006 apic->send_IPI_allbutself = xen_send_IPI_allbutself;
1007 apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
1008 apic->send_IPI_mask = xen_send_IPI_mask;
1009 apic->send_IPI_all = xen_send_IPI_all;
1010 apic->send_IPI_self = xen_send_IPI_self;
1011#endif
1012}
1013
1014#endif
1015
1016static void xen_clts(void) 931static void xen_clts(void)
1017{ 932{
1018 struct multicall_space mcs; 933 struct multicall_space mcs;
@@ -1618,7 +1533,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
1618 /* 1533 /*
1619 * set up the basic apic ops. 1534 * set up the basic apic ops.
1620 */ 1535 */
1621 set_xen_basic_apic_ops(); 1536 xen_init_apic();
1622#endif 1537#endif
1623 1538
1624 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1539 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1731,8 +1646,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
1731 if (HYPERVISOR_dom0_op(&op) == 0) 1646 if (HYPERVISOR_dom0_op(&op) == 0)
1732 boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; 1647 boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1733 1648
1734 xen_init_apic();
1735
1736 /* Make sure ACS will be enabled */ 1649 /* Make sure ACS will be enabled */
1737 pci_request_acs(); 1650 pci_request_acs();
1738 1651
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..dd151b2045b0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
502} 502}
503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
504 504
505#if PAGETABLE_LEVELS == 4 505#if CONFIG_PGTABLE_LEVELS == 4
506__visible pudval_t xen_pud_val(pud_t pud) 506__visible pudval_t xen_pud_val(pud_t pud)
507{ 507{
508 return pte_mfn_to_pfn(pud.pud); 508 return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
589 589
590 xen_mc_issue(PARAVIRT_LAZY_MMU); 590 xen_mc_issue(PARAVIRT_LAZY_MMU);
591} 591}
592#endif /* PAGETABLE_LEVELS == 4 */ 592#endif /* CONFIG_PGTABLE_LEVELS == 4 */
593 593
594/* 594/*
595 * (Yet another) pagetable walker. This one is intended for pinning a 595 * (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
1628 xen_release_ptpage(pfn, PT_PMD); 1628 xen_release_ptpage(pfn, PT_PMD);
1629} 1629}
1630 1630
1631#if PAGETABLE_LEVELS == 4 1631#if CONFIG_PGTABLE_LEVELS == 4
1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1633{ 1633{
1634 xen_alloc_ptpage(mm, pfn, PT_PUD); 1634 xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
2046 pv_mmu_ops.set_pte = xen_set_pte; 2046 pv_mmu_ops.set_pte = xen_set_pte;
2047 pv_mmu_ops.set_pmd = xen_set_pmd; 2047 pv_mmu_ops.set_pmd = xen_set_pmd;
2048 pv_mmu_ops.set_pud = xen_set_pud; 2048 pv_mmu_ops.set_pud = xen_set_pud;
2049#if PAGETABLE_LEVELS == 4 2049#if CONFIG_PGTABLE_LEVELS == 4
2050 pv_mmu_ops.set_pgd = xen_set_pgd; 2050 pv_mmu_ops.set_pgd = xen_set_pgd;
2051#endif 2051#endif
2052 2052
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2057 pv_mmu_ops.release_pte = xen_release_pte; 2057 pv_mmu_ops.release_pte = xen_release_pte;
2058 pv_mmu_ops.release_pmd = xen_release_pmd; 2058 pv_mmu_ops.release_pmd = xen_release_pmd;
2059#if PAGETABLE_LEVELS == 4 2059#if CONFIG_PGTABLE_LEVELS == 4
2060 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2060 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2061 pv_mmu_ops.release_pud = xen_release_pud; 2061 pv_mmu_ops.release_pud = xen_release_pud;
2062#endif 2062#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2124 2124
2125#if PAGETABLE_LEVELS == 4 2125#if CONFIG_PGTABLE_LEVELS == 4
2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2128 .set_pgd = xen_set_pgd_hyper, 2128 .set_pgd = xen_set_pgd_hyper,
2129 2129
2130 .alloc_pud = xen_alloc_pmd_init, 2130 .alloc_pud = xen_alloc_pmd_init,
2131 .release_pud = xen_release_pmd_init, 2131 .release_pud = xen_release_pmd_init,
2132#endif /* PAGETABLE_LEVELS == 4 */ 2132#endif /* CONFIG_PGTABLE_LEVELS == 4 */
2133 2133
2134 .activate_mm = xen_activate_mm, 2134 .activate_mm = xen_activate_mm,
2135 .dup_mmap = xen_dup_mmap, 2135 .dup_mmap = xen_dup_mmap,
@@ -2436,99 +2436,11 @@ void __init xen_hvm_init_mmu_ops(void)
2436} 2436}
2437#endif 2437#endif
2438 2438
2439#ifdef CONFIG_XEN_PVH
2440/*
2441 * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
2442 * space creating new guest on pvh dom0 and needing to map domU pages.
2443 */
2444static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
2445 unsigned int domid)
2446{
2447 int rc, err = 0;
2448 xen_pfn_t gpfn = lpfn;
2449 xen_ulong_t idx = fgfn;
2450
2451 struct xen_add_to_physmap_range xatp = {
2452 .domid = DOMID_SELF,
2453 .foreign_domid = domid,
2454 .size = 1,
2455 .space = XENMAPSPACE_gmfn_foreign,
2456 };
2457 set_xen_guest_handle(xatp.idxs, &idx);
2458 set_xen_guest_handle(xatp.gpfns, &gpfn);
2459 set_xen_guest_handle(xatp.errs, &err);
2460
2461 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
2462 if (rc < 0)
2463 return rc;
2464 return err;
2465}
2466
2467static int xlate_remove_from_p2m(unsigned long spfn, int count)
2468{
2469 struct xen_remove_from_physmap xrp;
2470 int i, rc;
2471
2472 for (i = 0; i < count; i++) {
2473 xrp.domid = DOMID_SELF;
2474 xrp.gpfn = spfn+i;
2475 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
2476 if (rc)
2477 break;
2478 }
2479 return rc;
2480}
2481
2482struct xlate_remap_data {
2483 unsigned long fgfn; /* foreign domain's gfn */
2484 pgprot_t prot;
2485 domid_t domid;
2486 int index;
2487 struct page **pages;
2488};
2489
2490static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
2491 void *data)
2492{
2493 int rc;
2494 struct xlate_remap_data *remap = data;
2495 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
2496 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
2497
2498 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
2499 if (rc)
2500 return rc;
2501 native_set_pte(ptep, pteval);
2502
2503 return 0;
2504}
2505
2506static int xlate_remap_gfn_range(struct vm_area_struct *vma,
2507 unsigned long addr, unsigned long mfn,
2508 int nr, pgprot_t prot, unsigned domid,
2509 struct page **pages)
2510{
2511 int err;
2512 struct xlate_remap_data pvhdata;
2513
2514 BUG_ON(!pages);
2515
2516 pvhdata.fgfn = mfn;
2517 pvhdata.prot = prot;
2518 pvhdata.domid = domid;
2519 pvhdata.index = 0;
2520 pvhdata.pages = pages;
2521 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
2522 xlate_map_pte_fn, &pvhdata);
2523 flush_tlb_all();
2524 return err;
2525}
2526#endif
2527
2528#define REMAP_BATCH_SIZE 16 2439#define REMAP_BATCH_SIZE 16
2529 2440
2530struct remap_data { 2441struct remap_data {
2531 unsigned long mfn; 2442 xen_pfn_t *mfn;
2443 bool contiguous;
2532 pgprot_t prot; 2444 pgprot_t prot;
2533 struct mmu_update *mmu_update; 2445 struct mmu_update *mmu_update;
2534}; 2446};
@@ -2537,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2537 unsigned long addr, void *data) 2449 unsigned long addr, void *data)
2538{ 2450{
2539 struct remap_data *rmd = data; 2451 struct remap_data *rmd = data;
2540 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); 2452 pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
2453
2454	/* If we have a contiguous range, just update the mfn itself,
2455	   else advance the pointer to the next mfn. */
2456 if (rmd->contiguous)
2457 (*rmd->mfn)++;
2458 else
2459 rmd->mfn++;
2541 2460
2542 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; 2461 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2543 rmd->mmu_update->val = pte_val_ma(pte); 2462 rmd->mmu_update->val = pte_val_ma(pte);
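
The hunk above changes remap_area_mfn_pte_fn() to take each frame either from a single incrementing base (contiguous mapping) or from a caller-supplied array with one entry per page. A simplified sketch of that walk, using hypothetical types and names:

#include <stdint.h>

struct frame_walk {
	uint64_t *frame;	/* a single base frame, or the current array slot */
	int contiguous;
};

static uint64_t next_frame(struct frame_walk *w)
{
	uint64_t cur = *w->frame;

	if (w->contiguous)
		(*w->frame)++;	/* same slot, consecutive frame numbers */
	else
		w->frame++;	/* move on to the next caller-supplied entry */

	return cur;
}
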
@@ -2546,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2546 return 0; 2465 return 0;
2547} 2466}
2548 2467
2549int xen_remap_domain_mfn_range(struct vm_area_struct *vma, 2468static int do_remap_mfn(struct vm_area_struct *vma,
2550 unsigned long addr, 2469 unsigned long addr,
2551 xen_pfn_t mfn, int nr, 2470 xen_pfn_t *mfn, int nr,
2552 pgprot_t prot, unsigned domid, 2471 int *err_ptr, pgprot_t prot,
2553 struct page **pages) 2472 unsigned domid,
2554 2473 struct page **pages)
2555{ 2474{
2475 int err = 0;
2556 struct remap_data rmd; 2476 struct remap_data rmd;
2557 struct mmu_update mmu_update[REMAP_BATCH_SIZE]; 2477 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2558 int batch;
2559 unsigned long range; 2478 unsigned long range;
2560 int err = 0; 2479 int mapped = 0;
2561 2480
2562 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2481 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2563 2482
2564 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2483 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2565#ifdef CONFIG_XEN_PVH 2484#ifdef CONFIG_XEN_PVH
2566 /* We need to update the local page tables and the xen HAP */ 2485 /* We need to update the local page tables and the xen HAP */
2567 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, 2486 return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
2568 domid, pages); 2487 prot, domid, pages);
2569#else 2488#else
2570 return -EINVAL; 2489 return -EINVAL;
2571#endif 2490#endif
@@ -2573,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2573 2492
2574 rmd.mfn = mfn; 2493 rmd.mfn = mfn;
2575 rmd.prot = prot; 2494 rmd.prot = prot;
2495	/* We use err_ptr to indicate whether we are doing a contiguous
2496	 * mapping or a discontiguous mapping. */
2497 rmd.contiguous = !err_ptr;
2576 2498
2577 while (nr) { 2499 while (nr) {
2578 batch = min(REMAP_BATCH_SIZE, nr); 2500 int index = 0;
2501 int done = 0;
2502 int batch = min(REMAP_BATCH_SIZE, nr);
2503 int batch_left = batch;
2579 range = (unsigned long)batch << PAGE_SHIFT; 2504 range = (unsigned long)batch << PAGE_SHIFT;
2580 2505
2581 rmd.mmu_update = mmu_update; 2506 rmd.mmu_update = mmu_update;
@@ -2584,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2584 if (err) 2509 if (err)
2585 goto out; 2510 goto out;
2586 2511
2587		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); 2512		/* We record an error for each page that fails, but continue
2588		if (err < 0) 2513		 * mapping until the whole set is done. */
2589 goto out; 2514 do {
2515 int i;
2516
2517 err = HYPERVISOR_mmu_update(&mmu_update[index],
2518 batch_left, &done, domid);
2519
2520 /*
2521 * @err_ptr may be the same buffer as @mfn, so
2522 * only clear it after each chunk of @mfn is
2523 * used.
2524 */
2525 if (err_ptr) {
2526 for (i = index; i < index + done; i++)
2527 err_ptr[i] = 0;
2528 }
2529 if (err < 0) {
2530 if (!err_ptr)
2531 goto out;
2532 err_ptr[i] = err;
2533 done++; /* Skip failed frame. */
2534 } else
2535 mapped += done;
2536 batch_left -= done;
2537 index += done;
2538 } while (batch_left);
2590 2539
2591 nr -= batch; 2540 nr -= batch;
2592 addr += range; 2541 addr += range;
2542 if (err_ptr)
2543 err_ptr += batch;
2593 } 2544 }
2594
2595 err = 0;
2596out: 2545out:
2597 2546
2598 xen_flush_tlb_all(); 2547 xen_flush_tlb_all();
2599 2548
2600 return err; 2549 return err < 0 ? err : mapped;
2550}
2551
2552int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2553 unsigned long addr,
2554 xen_pfn_t mfn, int nr,
2555 pgprot_t prot, unsigned domid,
2556 struct page **pages)
2557{
2558 return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
2601} 2559}
2602EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2560EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2603 2561
2562int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
2563 unsigned long addr,
2564 xen_pfn_t *mfn, int nr,
2565 int *err_ptr, pgprot_t prot,
2566 unsigned domid, struct page **pages)
2567{
2568	/* We BUG_ON because it is a programmer error to pass a NULL err_ptr:
2569	 * without it, a later report of "wrong memory was mapped in" is very
2570	 * hard to trace back to its actual cause.
2571	 */
2572 BUG_ON(err_ptr == NULL);
2573 return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
2574}
2575EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
2576
2577
2604/* Returns: 0 success */ 2578/* Returns: 0 success */
2605int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, 2579int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2606 int numpgs, struct page **pages) 2580 int numpgs, struct page **pages)
@@ -2609,22 +2583,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2609 return 0; 2583 return 0;
2610 2584
2611#ifdef CONFIG_XEN_PVH 2585#ifdef CONFIG_XEN_PVH
2612 while (numpgs--) { 2586 return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
2613 /*
2614 * The mmu has already cleaned up the process mmu
2615 * resources at this point (lookup_address will return
2616 * NULL).
2617 */
2618 unsigned long pfn = page_to_pfn(pages[numpgs]);
2619
2620 xlate_remove_from_p2m(pfn, 1);
2621 }
2622 /*
2623 * We don't need to flush tlbs because as part of
2624 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
2625 * after removing the p2m entries from the EPT/NPT
2626 */
2627 return 0;
2628#else 2587#else
2629 return -EINVAL; 2588 return -EINVAL;
2630#endif 2589#endif
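
do_remap_mfn() above submits each batch in chunks and, when a per-page error array is supplied, records the failure, skips the offending frame and carries on instead of aborting the whole range. The sketch below compresses that loop; process() stands in for HYPERVISOR_mmu_update() (it consumes some prefix of the entries, reports the count via *done, and returns < 0 if the next entry failed). All names are illustrative.

static int apply_batch(int (*process)(long *entries, int count, int *done),
		       long *entries, int batch, int *err_ptr)
{
	int index = 0, left = batch, applied = 0;

	while (left) {
		int done = 0;
		int err = process(&entries[index], left, &done);

		if (err_ptr) {
			for (int i = index; i < index + done; i++)
				err_ptr[i] = 0;		/* these entries went through */
		}
		if (err < 0) {
			if (!err_ptr)
				return err;		/* no per-entry reporting: abort */
			err_ptr[index + done] = err;	/* record and skip the bad entry */
			done++;
		} else {
			applied += done;
		}
		left -= done;
		index += done;
	}
	return applied;		/* mirrors the 'mapped' count in the hunk above */
}
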
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..86484384492e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
90 90
91 set_cpu_online(cpu, true); 91 set_cpu_online(cpu, true);
92 92
93 this_cpu_write(cpu_state, CPU_ONLINE); 93 cpu_set_state_online(cpu); /* Implies full memory barrier. */
94
95 wmb();
96 94
97 /* We can take interrupts now: we're officially "up". */ 95 /* We can take interrupts now: we're officially "up". */
98 local_irq_enable(); 96 local_irq_enable();
99
100 wmb(); /* make sure everything is out */
101} 97}
102 98
103/* 99/*
@@ -445,21 +441,19 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
445{ 441{
446 int rc; 442 int rc;
447 443
448 per_cpu(current_task, cpu) = idle; 444 common_cpu_up(cpu, idle);
449#ifdef CONFIG_X86_32
450 irq_ctx_init(cpu);
451#else
452 clear_tsk_thread_flag(idle, TIF_FORK);
453#endif
454 per_cpu(kernel_stack, cpu) =
455 (unsigned long)task_stack_page(idle) -
456 KERNEL_STACK_OFFSET + THREAD_SIZE;
457 445
458 xen_setup_runstate_info(cpu); 446 xen_setup_runstate_info(cpu);
459 xen_setup_timer(cpu); 447 xen_setup_timer(cpu);
460 xen_init_lock_cpu(cpu); 448 xen_init_lock_cpu(cpu);
461 449
462 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 450 /*
451 * PV VCPUs are always successfully taken down (see 'while' loop
452 * in xen_cpu_die()), so -EBUSY is an error.
453 */
454 rc = cpu_check_up_prepare(cpu);
455 if (rc)
456 return rc;
463 457
464 /* make sure interrupts start blocked */ 458 /* make sure interrupts start blocked */
465 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 459 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -468,10 +462,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
468 if (rc) 462 if (rc)
469 return rc; 463 return rc;
470 464
471 if (num_online_cpus() == 1)
472 /* Just in case we booted with a single CPU. */
473 alternatives_enable_smp();
474
475 rc = xen_smp_intr_init(cpu); 465 rc = xen_smp_intr_init(cpu);
476 if (rc) 466 if (rc)
477 return rc; 467 return rc;
@@ -479,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
479 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 469 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
480 BUG_ON(rc); 470 BUG_ON(rc);
481 471
482 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { 472 while (cpu_report_state(cpu) != CPU_ONLINE)
483 HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 473 HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
484 barrier();
485 }
486 474
487 return 0; 475 return 0;
488} 476}
@@ -511,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu)
511 schedule_timeout(HZ/10); 499 schedule_timeout(HZ/10);
512 } 500 }
513 501
514 cpu_die_common(cpu); 502 if (common_cpu_die(cpu) == 0) {
515 503 xen_smp_intr_free(cpu);
516 xen_smp_intr_free(cpu); 504 xen_uninit_lock_cpu(cpu);
517 xen_uninit_lock_cpu(cpu); 505 xen_teardown_timer(cpu);
518 xen_teardown_timer(cpu); 506 }
519} 507}
520 508
521static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ 509static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
747static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 735static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
748{ 736{
749 int rc; 737 int rc;
738
739 /*
740	 * This can happen if the CPU was offlined earlier and
741 * offlining timed out in common_cpu_die().
742 */
743 if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
744 xen_smp_intr_free(cpu);
745 xen_uninit_lock_cpu(cpu);
746 }
747
750 /* 748 /*
751 * xen_smp_intr_init() needs to run before native_cpu_up() 749 * xen_smp_intr_init() needs to run before native_cpu_up()
752 * so that IPI vectors are set up on the booting CPU before 750 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
768 return rc; 766 return rc;
769} 767}
770 768
771static void xen_hvm_cpu_die(unsigned int cpu)
772{
773 xen_cpu_die(cpu);
774 native_cpu_die(cpu);
775}
776
777void __init xen_hvm_smp_init(void) 769void __init xen_hvm_smp_init(void)
778{ 770{
779 if (!xen_have_vector_callback) 771 if (!xen_have_vector_callback)
@@ -781,7 +773,7 @@ void __init xen_hvm_smp_init(void)
781 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 773 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
782 smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 774 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
783 smp_ops.cpu_up = xen_hvm_cpu_up; 775 smp_ops.cpu_up = xen_hvm_cpu_up;
784 smp_ops.cpu_die = xen_hvm_cpu_die; 776 smp_ops.cpu_die = xen_cpu_die;
785 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 777 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
786 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 778 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
787 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; 779 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
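
The smp.c changes above move Xen onto the common CPU hotplug state helpers: xen_cpu_up() refuses a CPU that is still reported online, and xen_hvm_cpu_up() tears down per-CPU resources left over when a previous offline timed out (CPU_DEAD_FROZEN). A rough sketch of that bring-up check, with illustrative state names and helpers rather than the kernel's real API:

enum cpu_state { STATE_ONLINE, STATE_DEAD, STATE_DEAD_FROZEN };

#define MAX_CPUS 64
static enum cpu_state reported_state[MAX_CPUS];

static int prepare_bringup(int cpu, void (*free_percpu)(int))
{
	if (reported_state[cpu] == STATE_DEAD_FROZEN)
		free_percpu(cpu);	/* leftovers from a timed-out offline */

	if (reported_state[cpu] == STATE_ONLINE)
		return -1;		/* still marked online: refuse bring-up */

	reported_state[cpu] = STATE_DEAD;
	return 0;
}
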
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index c4df9dbd63b7..d9497698645a 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,5 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h> 2#include <linux/tick.h>
3 3
4#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
5#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled)
81 81
82static void xen_vcpu_notify_restore(void *data) 82static void xen_vcpu_notify_restore(void *data)
83{ 83{
84 unsigned long reason = (unsigned long)data;
85
86 /* Boot processor notified via generic timekeeping_resume() */ 84 /* Boot processor notified via generic timekeeping_resume() */
87 if ( smp_processor_id() == 0) 85 if (smp_processor_id() == 0)
88 return; 86 return;
89 87
90 clockevents_notify(reason, NULL); 88 tick_resume_local();
91} 89}
92 90
93void xen_arch_resume(void) 91void xen_arch_resume(void)
94{ 92{
95 on_each_cpu(xen_vcpu_notify_restore, 93 on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
96 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
97} 94}
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 520022d1a181..a702ec2f5931 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -1,54 +1,12 @@
1#include <linux/ftrace.h> 1#include <linux/ftrace.h>
2#include <xen/interface/xen.h> 2#include <xen/interface/xen.h>
3#include <xen/interface/xen-mca.h>
3 4
4#define N(x) [__HYPERVISOR_##x] = "("#x")" 5#define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")",
5static const char *xen_hypercall_names[] = { 6static const char *xen_hypercall_names[] = {
6 N(set_trap_table), 7#include <asm/xen-hypercalls.h>
7 N(mmu_update),
8 N(set_gdt),
9 N(stack_switch),
10 N(set_callbacks),
11 N(fpu_taskswitch),
12 N(sched_op_compat),
13 N(dom0_op),
14 N(set_debugreg),
15 N(get_debugreg),
16 N(update_descriptor),
17 N(memory_op),
18 N(multicall),
19 N(update_va_mapping),
20 N(set_timer_op),
21 N(event_channel_op_compat),
22 N(xen_version),
23 N(console_io),
24 N(physdev_op_compat),
25 N(grant_table_op),
26 N(vm_assist),
27 N(update_va_mapping_otherdomain),
28 N(iret),
29 N(vcpu_op),
30 N(set_segment_base),
31 N(mmuext_op),
32 N(acm_op),
33 N(nmi_op),
34 N(sched_op),
35 N(callback_op),
36 N(xenoprof_op),
37 N(event_channel_op),
38 N(physdev_op),
39 N(hvm_op),
40
41/* Architecture-specific hypercall definitions. */
42 N(arch_0),
43 N(arch_1),
44 N(arch_2),
45 N(arch_3),
46 N(arch_4),
47 N(arch_5),
48 N(arch_6),
49 N(arch_7),
50}; 8};
51#undef N 9#undef HYPERCALL
52 10
53static const char *xen_hypercall_name(unsigned op) 11static const char *xen_hypercall_name(unsigned op)
54{ 12{
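
The new trace.c builds its hypercall-name table with an X-macro: asm/xen-hypercalls.h is just a list of HYPERCALL(name) lines, and each includer defines HYPERCALL() to expand those lines however it needs. A self-contained sketch of the idiom follows, with made-up hypercall numbers standing in for the real interface headers.

/* Stand-ins for the numbers normally provided by xen/interface headers. */
#define __HC_mmu_update	1
#define __HC_memory_op	12

/* What "#include <asm/xen-hypercalls.h>" pulls in is just a list of
 * HYPERCALL(...) lines; two are written out here for illustration. */
#define HYPERCALL(x) [__HC_##x] = "(" #x ")",
static const char *call_names[] = {
	HYPERCALL(mmu_update)
	HYPERCALL(memory_op)
};
#undef HYPERCALL
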
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
68 * We're already on the usermode stack at this point, but 68 * We're already on the usermode stack at this point, but
69 * still with the kernel gs, so we can easily switch back 69 * still with the kernel gs, so we can easily switch back
70 */ 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp) 71 movq %rsp, PER_CPU_VAR(rsp_scratch)
72 movq PER_CPU_VAR(kernel_stack), %rsp 72 movq PER_CPU_VAR(kernel_stack), %rsp
73 73
74 pushq $__USER_DS 74 pushq $__USER_DS
75 pushq PER_CPU_VAR(old_rsp) 75 pushq PER_CPU_VAR(rsp_scratch)
76 pushq %r11 76 pushq %r11
77 pushq $__USER_CS 77 pushq $__USER_CS
78 pushq %rcx 78 pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
87 * We're already on the usermode stack at this point, but 87 * We're already on the usermode stack at this point, but
88 * still with the kernel gs, so we can easily switch back 88 * still with the kernel gs, so we can easily switch back
89 */ 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp) 90 movq %rsp, PER_CPU_VAR(rsp_scratch)
91 movq PER_CPU_VAR(kernel_stack), %rsp 91 movq PER_CPU_VAR(kernel_stack), %rsp
92 92
93 pushq $__USER32_DS 93 pushq $__USER32_DS
94 pushq PER_CPU_VAR(old_rsp) 94 pushq PER_CPU_VAR(rsp_scratch)
95 pushq %r11 95 pushq %r11
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 674b222544b7..8afdfccf6086 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -12,6 +12,8 @@
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <xen/interface/features.h> 14#include <xen/interface/features.h>
15#include <xen/interface/xen.h>
16#include <xen/interface/xen-mca.h>
15#include <asm/xen/interface.h> 17#include <asm/xen/interface.h>
16 18
17#ifdef CONFIG_XEN_PVH 19#ifdef CONFIG_XEN_PVH
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init)
85.pushsection .text 87.pushsection .text
86 .balign PAGE_SIZE 88 .balign PAGE_SIZE
87ENTRY(hypercall_page) 89ENTRY(hypercall_page)
88#define NEXT_HYPERCALL(x) \ 90 .skip PAGE_SIZE
89 ENTRY(xen_hypercall_##x) \ 91
90 .skip 32 92#define HYPERCALL(n) \
91 93 .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
92NEXT_HYPERCALL(set_trap_table) 94 .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
93NEXT_HYPERCALL(mmu_update) 95#include <asm/xen-hypercalls.h>
94NEXT_HYPERCALL(set_gdt) 96#undef HYPERCALL
95NEXT_HYPERCALL(stack_switch) 97
96NEXT_HYPERCALL(set_callbacks)
97NEXT_HYPERCALL(fpu_taskswitch)
98NEXT_HYPERCALL(sched_op_compat)
99NEXT_HYPERCALL(platform_op)
100NEXT_HYPERCALL(set_debugreg)
101NEXT_HYPERCALL(get_debugreg)
102NEXT_HYPERCALL(update_descriptor)
103NEXT_HYPERCALL(ni)
104NEXT_HYPERCALL(memory_op)
105NEXT_HYPERCALL(multicall)
106NEXT_HYPERCALL(update_va_mapping)
107NEXT_HYPERCALL(set_timer_op)
108NEXT_HYPERCALL(event_channel_op_compat)
109NEXT_HYPERCALL(xen_version)
110NEXT_HYPERCALL(console_io)
111NEXT_HYPERCALL(physdev_op_compat)
112NEXT_HYPERCALL(grant_table_op)
113NEXT_HYPERCALL(vm_assist)
114NEXT_HYPERCALL(update_va_mapping_otherdomain)
115NEXT_HYPERCALL(iret)
116NEXT_HYPERCALL(vcpu_op)
117NEXT_HYPERCALL(set_segment_base)
118NEXT_HYPERCALL(mmuext_op)
119NEXT_HYPERCALL(xsm_op)
120NEXT_HYPERCALL(nmi_op)
121NEXT_HYPERCALL(sched_op)
122NEXT_HYPERCALL(callback_op)
123NEXT_HYPERCALL(xenoprof_op)
124NEXT_HYPERCALL(event_channel_op)
125NEXT_HYPERCALL(physdev_op)
126NEXT_HYPERCALL(hvm_op)
127NEXT_HYPERCALL(sysctl)
128NEXT_HYPERCALL(domctl)
129NEXT_HYPERCALL(kexec_op)
130NEXT_HYPERCALL(tmem_op) /* 38 */
131ENTRY(xen_hypercall_rsvr)
132 .skip 320
133NEXT_HYPERCALL(mca) /* 48 */
134NEXT_HYPERCALL(arch_1)
135NEXT_HYPERCALL(arch_2)
136NEXT_HYPERCALL(arch_3)
137NEXT_HYPERCALL(arch_4)
138NEXT_HYPERCALL(arch_5)
139NEXT_HYPERCALL(arch_6)
140 .balign PAGE_SIZE
141.popsection 98.popsection
142 99
143 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 100 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
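
The xen-head.S change above stops emitting one hand-listed 32-byte stub per hypercall and instead reserves a blank page, then defines every xen_hypercall_<name> symbol as a fixed offset into it with .equ, driven by the same xen-hypercalls.h list. In C terms the layout is simply base + number * 32; a small sketch with illustrative names:

#define HYPERCALL_SLOT_SIZE 32

/* Address of hypercall 'nr' inside a page laid out as fixed 32-byte slots. */
static inline void *hypercall_entry(void *hypercall_page, unsigned int nr)
{
	return (char *)hypercall_page + nr * HYPERCALL_SLOT_SIZE;
}
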