Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 68
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/Makefile.um | 2
-rw-r--r--  arch/x86/boot/compressed/aslr.c | 5
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 3
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 5
-rw-r--r--  arch/x86/boot/compressed/misc.h | 6
-rw-r--r--  arch/x86/boot/string.c | 2
-rw-r--r--  arch/x86/boot/video-mode.c | 4
-rw-r--r--  arch/x86/boot/video.c | 2
-rw-r--r--  arch/x86/boot/video.h | 1
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 187
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx2_glue.c | 15
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx_glue.c | 15
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c | 9
-rw-r--r--  arch/x86/crypto/cast6_avx_glue.c | 15
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 7
-rw-r--r--  arch/x86/crypto/glue_helper.c | 1
-rw-r--r--  arch/x86/crypto/serpent_avx2_glue.c | 15
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c | 15
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c | 15
-rw-r--r--  arch/x86/crypto/sha-mb/sha1_mb.c | 9
-rw-r--r--  arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c | 2
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 139
-rw-r--r--  arch/x86/crypto/sha256-avx-asm.S | 10
-rw-r--r--  arch/x86/crypto/sha256-avx2-asm.S | 10
-rw-r--r--  arch/x86/crypto/sha256-ssse3-asm.S | 10
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c | 193
-rw-r--r--  arch/x86/crypto/sha512-avx-asm.S | 6
-rw-r--r--  arch/x86/crypto/sha512-avx2-asm.S | 6
-rw-r--r--  arch/x86/crypto/sha512-ssse3-asm.S | 6
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c | 202
-rw-r--r--  arch/x86/crypto/twofish-x86_64-asm_64.S | 4
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c | 15
-rw-r--r--  arch/x86/ia32/Makefile | 1
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 19
-rw-r--r--  arch/x86/ia32/ia32entry.S | 485
-rw-r--r--  arch/x86/ia32/nosyscall.c | 7
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 14
-rw-r--r--  arch/x86/ia32/syscall_ia32.c | 25
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 53
-rw-r--r--  arch/x86/include/asm/alternative.h | 73
-rw-r--r--  arch/x86/include/asm/apic.h | 3
-rw-r--r--  arch/x86/include/asm/barrier.h | 6
-rw-r--r--  arch/x86/include/asm/calling.h | 284
-rw-r--r--  arch/x86/include/asm/compat.h | 2
-rw-r--r--  arch/x86/include/asm/cpu.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 32
-rw-r--r--  arch/x86/include/asm/desc.h | 7
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 24
-rw-r--r--  arch/x86/include/asm/e820.h | 8
-rw-r--r--  arch/x86/include/asm/efi.h | 6
-rw-r--r--  arch/x86/include/asm/elf.h | 11
-rw-r--r--  arch/x86/include/asm/fpu-internal.h | 130
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 5
-rw-r--r--  arch/x86/include/asm/insn.h | 2
-rw-r--r--  arch/x86/include/asm/iommu_table.h | 11
-rw-r--r--  arch/x86/include/asm/irqflags.h | 49
-rw-r--r--  arch/x86/include/asm/jump_label.h | 5
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 28
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 2
-rw-r--r--  arch/x86/include/asm/livepatch.h | 4
-rw-r--r--  arch/x86/include/asm/mce.h | 16
-rw-r--r--  arch/x86/include/asm/microcode.h | 73
-rw-r--r--  arch/x86/include/asm/microcode_intel.h | 13
-rw-r--r--  arch/x86/include/asm/mwait.h | 8
-rw-r--r--  arch/x86/include/asm/page_types.h | 2
-rw-r--r--  arch/x86/include/asm/paravirt.h | 13
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 8
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable-2level_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 4
-rw-r--r--  arch/x86/include/asm/pm-trace.h (renamed from arch/x86/include/asm/resume-trace.h) | 10
-rw-r--r--  arch/x86/include/asm/processor.h | 107
-rw-r--r--  arch/x86/include/asm/ptrace.h | 45
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/seccomp.h | 21
-rw-r--r--  arch/x86/include/asm/seccomp_32.h | 11
-rw-r--r--  arch/x86/include/asm/seccomp_64.h | 17
-rw-r--r--  arch/x86/include/asm/segment.h | 289
-rw-r--r--  arch/x86/include/asm/setup.h | 5
-rw-r--r--  arch/x86/include/asm/sigcontext.h | 6
-rw-r--r--  arch/x86/include/asm/sighandling.h | 4
-rw-r--r--  arch/x86/include/asm/smap.h | 30
-rw-r--r--  arch/x86/include/asm/smp.h | 3
-rw-r--r--  arch/x86/include/asm/special_insns.h | 24
-rw-r--r--  arch/x86/include/asm/thread_info.h | 77
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/bootparam.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/ptrace-abi.h | 16
-rw-r--r--  arch/x86/include/uapi/asm/ptrace.h | 13
-rw-r--r--  arch/x86/include/uapi/asm/sigcontext.h | 21
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 1
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/alternative.c | 163
-rw-r--r--  arch/x86/kernel/apic/apic.c | 62
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 8
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 89
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 9
-rw-r--r--  arch/x86/kernel/cpu/common.c | 87
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 715
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 66
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 154
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 63
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 1
-rw-r--r--  arch/x86/kernel/cpu/microcode/core_early.c | 75
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 4
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel_early.c | 345
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel_lib.c | 22
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.sh | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 12
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 18
-rw-r--r--  arch/x86/kernel/crash.c | 2
-rw-r--r--  arch/x86/kernel/devicetree.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack.c | 15
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 13
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 11
-rw-r--r--  arch/x86/kernel/e820.c | 2
-rw-r--r--  arch/x86/kernel/early_printk.c | 32
-rw-r--r--  arch/x86/kernel/entry_32.S | 93
-rw-r--r--  arch/x86/kernel/entry_64.S | 964
-rw-r--r--  arch/x86/kernel/head64.c | 3
-rw-r--r--  arch/x86/kernel/head_32.S | 3
-rw-r--r--  arch/x86/kernel/head_64.S | 6
-rw-r--r--  arch/x86/kernel/i387.c | 56
-rw-r--r--  arch/x86/kernel/ioport.c | 2
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 2
-rw-r--r--  arch/x86/kernel/irqinit.c | 3
-rw-r--r--  arch/x86/kernel/kgdb.c | 6
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/module.c | 11
-rw-r--r--  arch/x86/kernel/paravirt.c | 6
-rw-r--r--  arch/x86/kernel/perf_regs.c | 40
-rw-r--r--  arch/x86/kernel/process.c | 106
-rw-r--r--  arch/x86/kernel/process_32.c | 27
-rw-r--r--  arch/x86/kernel/process_64.c | 24
-rw-r--r--  arch/x86/kernel/ptrace.c | 12
-rw-r--r--  arch/x86/kernel/pvclock.c | 44
-rw-r--r--  arch/x86/kernel/reboot.c | 10
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 8
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 16
-rw-r--r--  arch/x86/kernel/setup.c | 21
-rw-r--r--  arch/x86/kernel/signal.c | 68
-rw-r--r--  arch/x86/kernel/smpboot.c | 77
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 30
-rw-r--r--  arch/x86/kernel/syscall_32.c | 16
-rw-r--r--  arch/x86/kernel/test_rodata.c | 2
-rw-r--r--  arch/x86/kernel/time.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 62
-rw-r--r--  arch/x86/kernel/uprobes.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 4
-rw-r--r--  arch/x86/kernel/xsave.c | 39
-rw-r--r--  arch/x86/kvm/Makefile | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 33
-rw-r--r--  arch/x86/kvm/cpuid.h | 8
-rw-r--r--  arch/x86/kvm/emulate.c | 193
-rw-r--r--  arch/x86/kvm/i8254.c | 14
-rw-r--r--  arch/x86/kvm/i8254.h | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 12
-rw-r--r--  arch/x86/kvm/ioapic.c | 22
-rw-r--r--  arch/x86/kvm/ioapic.h | 11
-rw-r--r--  arch/x86/kvm/irq.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 147
-rw-r--r--  arch/x86/kvm/lapic.h | 17
-rw-r--r--  arch/x86/kvm/mmu.c | 73
-rw-r--r--  arch/x86/kvm/pmu.c | 2
-rw-r--r--  arch/x86/kvm/svm.c | 43
-rw-r--r--  arch/x86/kvm/vmx.c | 146
-rw-r--r--  arch/x86/kvm/x86.c | 157
-rw-r--r--  arch/x86/lguest/boot.c | 4
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S | 50
-rw-r--r--  arch/x86/lib/checksum_32.S | 64
-rw-r--r--  arch/x86/lib/clear_page_64.S | 66
-rw-r--r--  arch/x86/lib/copy_page_64.S | 37
-rw-r--r--  arch/x86/lib/copy_user_64.S | 46
-rw-r--r--  arch/x86/lib/csum-copy_64.S | 2
-rw-r--r--  arch/x86/lib/insn.c | 13
-rw-r--r--  arch/x86/lib/memcpy_64.S | 68
-rw-r--r--  arch/x86/lib/memmove_64.S | 19
-rw-r--r--  arch/x86/lib/memset_64.S | 61
-rw-r--r--  arch/x86/lib/msr-reg.S | 24
-rw-r--r--  arch/x86/lib/rwsem.S | 44
-rw-r--r--  arch/x86/lib/thunk_32.S | 18
-rw-r--r--  arch/x86/lib/thunk_64.S | 28
-rw-r--r--  arch/x86/lib/usercopy_64.c | 15
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 9
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/fault.c | 8
-rw-r--r--  arch/x86/mm/init.c | 69
-rw-r--r--  arch/x86/mm/init_64.c | 14
-rw-r--r--  arch/x86/mm/ioremap.c | 23
-rw-r--r--  arch/x86/mm/memtest.c | 118
-rw-r--r--  arch/x86/mm/mmap.c | 38
-rw-r--r--  arch/x86/mm/numa.c | 11
-rw-r--r--  arch/x86/mm/pageattr.c | 4
-rw-r--r--  arch/x86/mm/pat.c | 6
-rw-r--r--  arch/x86/mm/pgtable.c | 160
-rw-r--r--  arch/x86/oprofile/backtrace.c | 2
-rw-r--r--  arch/x86/pci/common.c | 2
-rw-r--r--  arch/x86/platform/efi/efi-bgrt.c | 4
-rw-r--r--  arch/x86/platform/efi/efi.c | 17
-rw-r--r--  arch/x86/platform/efi/efi_32.c | 22
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 29
-rw-r--r--  arch/x86/platform/intel-quark/imr_selftest.c | 10
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-sci.c | 4
-rw-r--r--  arch/x86/platform/olpc/olpc-xo15-sci.c | 4
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 6
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/syscalls/Makefile | 9
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 4
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/um/Makefile | 1
-rw-r--r--  arch/x86/um/asm/barrier.h | 15
-rw-r--r--  arch/x86/um/asm/elf.h | 2
-rw-r--r--  arch/x86/um/ldt.c | 227
-rw-r--r--  arch/x86/um/shared/sysdep/faultinfo_32.h | 3
-rw-r--r--  arch/x86/um/shared/sysdep/faultinfo_64.h | 3
-rw-r--r--  arch/x86/um/shared/sysdep/skas_ptrace.h | 22
-rw-r--r--  arch/x86/um/signal.c | 7
-rw-r--r--  arch/x86/um/sys_call_table_64.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 4
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 34
-rw-r--r--  arch/x86/vdso/vdso32/syscall.S | 2
-rw-r--r--  arch/x86/xen/apic.c | 180
-rw-r--r--  arch/x86/xen/enlighten.c | 91
-rw-r--r--  arch/x86/xen/mmu.c | 221
-rw-r--r--  arch/x86/xen/p2m.c | 10
-rw-r--r--  arch/x86/xen/smp.c | 60
-rw-r--r--  arch/x86/xen/suspend.c | 11
-rw-r--r--  arch/x86/xen/trace.c | 50
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 8
-rw-r--r--  arch/x86/xen/xen-head.S | 63
246 files changed, 4920 insertions, 5071 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..d43e7e1c784b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,7 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select SPARSE_IRQ
@@ -99,6 +99,7 @@ config X86
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
 	select ARCH_HAS_SG_CHAIN
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -235,12 +236,10 @@ config ARCH_WANT_GENERAL_HUGETLB
 	def_bool y
 
 config ZONE_DMA32
-	bool
-	default X86_64
+	def_bool y if X86_64
 
 config AUDIT_ARCH
-	bool
-	default X86_64
+	def_bool y if X86_64
 
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
@@ -279,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES
 config FIX_EARLYCON_MEM
 	def_bool y
 
+config PGTABLE_LEVELS
+	int
+	default 4 if X86_64
+	default 3 if X86_PAE
+	default 2
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
@@ -716,17 +721,6 @@ endif #HYPERVISOR_GUEST
 config NO_BOOTMEM
 	def_bool y
 
-config MEMTEST
-	bool "Memtest"
-	---help---
-	  This option adds a kernel parameter 'memtest', which allows memtest
-	  to be set.
-	        memtest=0, mean disabled; -- default
-	        memtest=1, mean do 1 test pattern;
-	        ...
-	        memtest=4, mean do 4 test patterns.
-	  If you are unsure how to answer this question, answer N.
-
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
@@ -891,7 +885,8 @@ config UP_LATE_INIT
 	depends on !SMP && X86_LOCAL_APIC
 
 config X86_UP_APIC
-	bool "Local APIC support on uniprocessors"
+	bool "Local APIC support on uniprocessors" if !PCI_MSI
+	default PCI_MSI
 	depends on X86_32 && !SMP && !X86_32_NON_STANDARD
 	---help---
 	  A local APIC (Advanced Programmable Interrupt Controller) is an
@@ -903,10 +898,6 @@ config X86_UP_APIC
 	  performance counters), and the NMI watchdog which detects hard
 	  lockups.
 
-config X86_UP_APIC_MSI
-	def_bool y
-	select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
-
 config X86_UP_IOAPIC
 	bool "IO-APIC support on uniprocessors"
 	depends on X86_UP_APIC
@@ -925,8 +916,8 @@ config X86_LOCAL_APIC
 	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
 
 config X86_IO_APIC
-	def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
-	depends on X86_LOCAL_APIC
+	def_bool y
+	depends on X86_LOCAL_APIC || X86_UP_IOAPIC
 	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -1145,10 +1136,10 @@ config MICROCODE_OLD_INTERFACE
 	depends on MICROCODE
 
 config MICROCODE_INTEL_EARLY
-	def_bool n
+	bool
 
 config MICROCODE_AMD_EARLY
-	def_bool n
+	bool
 
 config MICROCODE_EARLY
 	bool "Early load microcode"
@@ -1300,14 +1291,14 @@ config ARCH_DMA_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || HIGHMEM64G
 
-config DIRECT_GBPAGES
-	bool "Enable 1GB pages for kernel pagetables" if EXPERT
-	default y
-	depends on X86_64
+config X86_DIRECT_GBPAGES
+	def_bool y
+	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
 	---help---
-	  Allow the kernel linear mapping to use 1GB pages on CPUs that
-	  support it. This can improve the kernel's performance a tiny bit by
-	  reducing TLB pressure. If in doubt, say "Y".
+	  Certain kernel features effectively disable kernel
+	  linear 1 GB mappings (even if the CPU otherwise
+	  supports them), so don't confuse the user by printing
+	  that we have them enabled.
 
 # Common NUMA Features
 config NUMA
@@ -1747,14 +1738,11 @@ config KEXEC_VERIFY_SIG
 	depends on KEXEC_FILE
 	---help---
 	  This option makes kernel signature verification mandatory for
-	  kexec_file_load() syscall. If kernel is signature can not be
-	  verified, kexec_file_load() will fail.
-
-	  This option enforces signature verification at generic level.
-	  One needs to enable signature verification for type of kernel
-	  image being loaded to make sure it works. For example, enable
-	  bzImage signature verification option to be able to load and
-	  verify signatures of bzImage. Otherwise kernel loading will fail.
+	  the kexec_file_load() syscall.
+
+	  In addition to that option, you need to enable signature
+	  verification for the corresponding kernel image type being
+	  loaded in order for this to work.
 
 config KEXEC_BZIMAGE_VERIFY_SIG
 	bool "Enable bzImage signature verification support"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ba2d9ce82dc..2fda005bb334 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y)
 		$(call cc-option,-fno-unit-at-a-time))
 
 # CPU-specific tuning. Anything which can be shared with UML should go here.
-include $(srctree)/arch/x86/Makefile_32.cpu
+include arch/x86/Makefile_32.cpu
 KBUILD_CFLAGS += $(cflags-y)
 
 # temporary until string.h is fixed
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA := -Ui386
 export LDS_EXTRA
 
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include $(srctree)/arch/x86/Makefile_32.cpu
+include arch/x86/Makefile_32.cpu
 
 # prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
 cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
295 return slots_fetch_random(); 295 return slots_fetch_random();
296} 296}
297 297
298unsigned char *choose_kernel_location(unsigned char *input, 298unsigned char *choose_kernel_location(struct boot_params *boot_params,
299 unsigned char *input,
299 unsigned long input_size, 300 unsigned long input_size,
300 unsigned char *output, 301 unsigned char *output,
301 unsigned long output_size) 302 unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
315 } 316 }
316#endif 317#endif
317 318
319 boot_params->hdr.loadflags |= KASLR_FLAG;
320
318 /* Record the various known unsafe memory ranges. */ 321 /* Record the various known unsafe memory ranges. */
319 mem_avoid_init((unsigned long)input, input_size, 322 mem_avoid_init((unsigned long)input, input_size,
320 (unsigned long)output, output_size); 323 (unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
29#include <asm/page_types.h> 29#include <asm/page_types.h>
30#include <asm/boot.h> 30#include <asm/boot.h>
31#include <asm/asm-offsets.h> 31#include <asm/asm-offsets.h>
32#include <asm/bootparam.h>
32 33
33 __HEAD 34 __HEAD
34ENTRY(startup_32) 35ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
102 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 103 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
103 * us to not reload segments 104 * us to not reload segments
104 */ 105 */
105 testb $(1<<6), BP_loadflags(%esi) 106 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
106 jnz 1f 107 jnz 1f
107 108
108 cli 109 cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
31#include <asm/msr.h> 31#include <asm/msr.h>
32#include <asm/processor-flags.h> 32#include <asm/processor-flags.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/bootparam.h>
34 35
35 __HEAD 36 __HEAD
36 .code32 37 .code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
46 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 47 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
47 * us to not reload segments 48 * us to not reload segments
48 */ 49 */
49 testb $(1<<6), BP_loadflags(%esi) 50 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
50 jnz 1f 51 jnz 1f
51 52
52 cli 53 cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
164 /* After gdt is loaded */ 165 /* After gdt is loaded */
165 xorl %eax, %eax 166 xorl %eax, %eax
166 lldt %ax 167 lldt %ax
167 movl $0x20, %eax 168 movl $__BOOT_TSS, %eax
168 ltr %ax 169 ltr %ax
169 170
170 /* 171 /*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
377 377
378 real_mode = rmode; 378 real_mode = rmode;
379 379
380 /* Clear it for solely in-kernel use */
381 real_mode->hdr.loadflags &= ~KASLR_FLAG;
382
380 sanitize_boot_params(real_mode); 383 sanitize_boot_params(real_mode);
381 384
382 if (real_mode->screen_info.orig_video_mode == 7) { 385 if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
401 * the entire decompressed kernel plus relocation table, or the 404 * the entire decompressed kernel plus relocation table, or the
402 * entire decompressed kernel plus .bss and .brk sections. 405 * entire decompressed kernel plus .bss and .brk sections.
403 */ 406 */
404 output = choose_kernel_location(input_data, input_len, output, 407 output = choose_kernel_location(real_mode, input_data, input_len, output,
405 output_len > run_size ? output_len 408 output_len > run_size ? output_len
406 : run_size); 409 : run_size);
407 410
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
57 57
58#if CONFIG_RANDOMIZE_BASE 58#if CONFIG_RANDOMIZE_BASE
59/* aslr.c */ 59/* aslr.c */
60unsigned char *choose_kernel_location(unsigned char *input, 60unsigned char *choose_kernel_location(struct boot_params *boot_params,
61 unsigned char *input,
61 unsigned long input_size, 62 unsigned long input_size,
62 unsigned char *output, 63 unsigned char *output,
63 unsigned long output_size); 64 unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
65bool has_cpuflag(int flag); 66bool has_cpuflag(int flag);
66#else 67#else
67static inline 68static inline
68unsigned char *choose_kernel_location(unsigned char *input, 69unsigned char *choose_kernel_location(struct boot_params *boot_params,
70 unsigned char *input,
69 unsigned long input_size, 71 unsigned long input_size,
70 unsigned char *output, 72 unsigned char *output,
71 unsigned long output_size) 73 unsigned long output_size)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 493f3fd9f139..318b8465d302 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2)
 	int delta = 0;
 
 	while (*s1 || *s2) {
-		delta = *s2 - *s1;
+		delta = *s1 - *s2;
 		if (delta)
 			return delta;
 		s1++;
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..aa8a96b052e3 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -22,10 +22,8 @@
22/* 22/*
23 * Common variables 23 * Common variables
24 */ 24 */
25int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ 25int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
26u16 video_segment;
27int force_x, force_y; /* Don't query the BIOS for cols/rows */ 26int force_x, force_y; /* Don't query the BIOS for cols/rows */
28
29int do_restore; /* Screen contents changed during mode flip */ 27int do_restore; /* Screen contents changed during mode flip */
30int graphic_mode; /* Graphic mode with linear frame buffer */ 28int graphic_mode; /* Graphic mode with linear frame buffer */
31 29
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 43eda284d27f..05111bb8d018 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -17,6 +17,8 @@
17#include "video.h" 17#include "video.h"
18#include "vesa.h" 18#include "vesa.h"
19 19
20static u16 video_segment;
21
20static void store_cursor_position(void) 22static void store_cursor_position(void)
21{ 23{
22 struct biosregs ireg, oreg; 24 struct biosregs ireg, oreg;
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 0bb25491262d..b54e0328c449 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -91,7 +91,6 @@ int mode_defined(u16 mode); /* video.c */
91#define ADAPTER_VGA 2 91#define ADAPTER_VGA 2
92 92
93extern int adapter; 93extern int adapter;
94extern u16 video_segment;
95extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ 94extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
96extern int do_restore; /* Restore screen contents */ 95extern int do_restore; /* Restore screen contents */
97extern int graphic_mode; /* Graphics mode with linear frame buffer */ 96extern int graphic_mode; /* Graphics mode with linear frame buffer */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 419819d6dab3..aaa1118bf01e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -248,7 +248,7 @@ CONFIG_USB=y
248CONFIG_USB_ANNOUNCE_NEW_DEVICES=y 248CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
249CONFIG_USB_MON=y 249CONFIG_USB_MON=y
250CONFIG_USB_EHCI_HCD=y 250CONFIG_USB_EHCI_HCD=y
251# CONFIG_USB_EHCI_TT_NEWSCHED is not set 251CONFIG_USB_EHCI_TT_NEWSCHED=y
252CONFIG_USB_OHCI_HCD=y 252CONFIG_USB_OHCI_HCD=y
253CONFIG_USB_UHCI_HCD=y 253CONFIG_USB_UHCI_HCD=y
254CONFIG_USB_PRINTER=y 254CONFIG_USB_PRINTER=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4c311ddd973b..315b86106572 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,7 +243,7 @@ CONFIG_USB=y
243CONFIG_USB_ANNOUNCE_NEW_DEVICES=y 243CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
244CONFIG_USB_MON=y 244CONFIG_USB_MON=y
245CONFIG_USB_EHCI_HCD=y 245CONFIG_USB_EHCI_HCD=y
246# CONFIG_USB_EHCI_TT_NEWSCHED is not set 246CONFIG_USB_EHCI_TT_NEWSCHED=y
247CONFIG_USB_OHCI_HCD=y 247CONFIG_USB_OHCI_HCD=y
248CONFIG_USB_UHCI_HCD=y 248CONFIG_USB_UHCI_HCD=y
249CONFIG_USB_PRINTER=y 249CONFIG_USB_PRINTER=y
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 54f60ab41c63..112cefacf2af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm)
797 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); 797 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
798 struct crypto_aead *cryptd_child; 798 struct crypto_aead *cryptd_child;
799 struct aesni_rfc4106_gcm_ctx *child_ctx; 799 struct aesni_rfc4106_gcm_ctx *child_ctx;
800 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); 800 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
801 CRYPTO_ALG_INTERNAL,
802 CRYPTO_ALG_INTERNAL);
801 if (IS_ERR(cryptd_tfm)) 803 if (IS_ERR(cryptd_tfm))
802 return PTR_ERR(cryptd_tfm); 804 return PTR_ERR(cryptd_tfm);
803 805
@@ -890,15 +892,12 @@ out_free_ablkcipher:
890 return ret; 892 return ret;
891} 893}
892 894
893static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, 895static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
894 unsigned int key_len) 896 unsigned int key_len)
895{ 897{
896 int ret = 0; 898 int ret = 0;
897 struct crypto_tfm *tfm = crypto_aead_tfm(parent); 899 struct crypto_tfm *tfm = crypto_aead_tfm(aead);
898 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 900 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
899 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
900 struct aesni_rfc4106_gcm_ctx *child_ctx =
901 aesni_rfc4106_gcm_ctx_get(cryptd_child);
902 u8 *new_key_align, *new_key_mem = NULL; 901 u8 *new_key_align, *new_key_mem = NULL;
903 902
904 if (key_len < 4) { 903 if (key_len < 4) {
@@ -943,20 +942,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
943 goto exit; 942 goto exit;
944 } 943 }
945 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); 944 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
946 memcpy(child_ctx, ctx, sizeof(*ctx));
947exit: 945exit:
948 kfree(new_key_mem); 946 kfree(new_key_mem);
949 return ret; 947 return ret;
950} 948}
951 949
952/* This is the Integrity Check Value (aka the authentication tag length and can 950static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
953 * be 8, 12 or 16 bytes long. */ 951 unsigned int key_len)
954static int rfc4106_set_authsize(struct crypto_aead *parent,
955 unsigned int authsize)
956{ 952{
957 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 953 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
958 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 954 struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
955 struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
956 struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
957 int ret;
959 958
959 ret = crypto_aead_setkey(child, key, key_len);
960 if (!ret) {
961 memcpy(ctx, c_ctx, sizeof(*ctx));
962 ctx->cryptd_tfm = cryptd_tfm;
963 }
964 return ret;
965}
966
967static int common_rfc4106_set_authsize(struct crypto_aead *aead,
968 unsigned int authsize)
969{
960 switch (authsize) { 970 switch (authsize) {
961 case 8: 971 case 8:
962 case 12: 972 case 12:
@@ -965,51 +975,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
965 default: 975 default:
966 return -EINVAL; 976 return -EINVAL;
967 } 977 }
968 crypto_aead_crt(parent)->authsize = authsize; 978 crypto_aead_crt(aead)->authsize = authsize;
969 crypto_aead_crt(cryptd_child)->authsize = authsize;
970 return 0; 979 return 0;
971} 980}
972 981
973static int rfc4106_encrypt(struct aead_request *req) 982/* This is the Integrity Check Value (aka the authentication tag length and can
974{ 983 * be 8, 12 or 16 bytes long. */
975 int ret; 984static int rfc4106_set_authsize(struct crypto_aead *parent,
976 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 985 unsigned int authsize)
977 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
978
979 if (!irq_fpu_usable()) {
980 struct aead_request *cryptd_req =
981 (struct aead_request *) aead_request_ctx(req);
982 memcpy(cryptd_req, req, sizeof(*req));
983 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
984 return crypto_aead_encrypt(cryptd_req);
985 } else {
986 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
987 kernel_fpu_begin();
988 ret = cryptd_child->base.crt_aead.encrypt(req);
989 kernel_fpu_end();
990 return ret;
991 }
992}
993
994static int rfc4106_decrypt(struct aead_request *req)
995{ 986{
987 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
988 struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
996 int ret; 989 int ret;
997 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
998 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
999 990
1000 if (!irq_fpu_usable()) { 991 ret = crypto_aead_setauthsize(child, authsize);
1001 struct aead_request *cryptd_req = 992 if (!ret)
1002 (struct aead_request *) aead_request_ctx(req); 993 crypto_aead_crt(parent)->authsize = authsize;
1003 memcpy(cryptd_req, req, sizeof(*req)); 994 return ret;
1004 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1005 return crypto_aead_decrypt(cryptd_req);
1006 } else {
1007 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1008 kernel_fpu_begin();
1009 ret = cryptd_child->base.crt_aead.decrypt(req);
1010 kernel_fpu_end();
1011 return ret;
1012 }
1013} 995}
1014 996
1015static int __driver_rfc4106_encrypt(struct aead_request *req) 997static int __driver_rfc4106_encrypt(struct aead_request *req)
@@ -1185,6 +1167,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
1185 } 1167 }
1186 return retval; 1168 return retval;
1187} 1169}
1170
1171static int rfc4106_encrypt(struct aead_request *req)
1172{
1173 int ret;
1174 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1175 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1176
1177 if (!irq_fpu_usable()) {
1178 struct aead_request *cryptd_req =
1179 (struct aead_request *) aead_request_ctx(req);
1180
1181 memcpy(cryptd_req, req, sizeof(*req));
1182 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1183 ret = crypto_aead_encrypt(cryptd_req);
1184 } else {
1185 kernel_fpu_begin();
1186 ret = __driver_rfc4106_encrypt(req);
1187 kernel_fpu_end();
1188 }
1189 return ret;
1190}
1191
1192static int rfc4106_decrypt(struct aead_request *req)
1193{
1194 int ret;
1195 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1196 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1197
1198 if (!irq_fpu_usable()) {
1199 struct aead_request *cryptd_req =
1200 (struct aead_request *) aead_request_ctx(req);
1201
1202 memcpy(cryptd_req, req, sizeof(*req));
1203 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1204 ret = crypto_aead_decrypt(cryptd_req);
1205 } else {
1206 kernel_fpu_begin();
1207 ret = __driver_rfc4106_decrypt(req);
1208 kernel_fpu_end();
1209 }
1210 return ret;
1211}
1212
1213static int helper_rfc4106_encrypt(struct aead_request *req)
1214{
1215 int ret;
1216
1217 if (unlikely(!irq_fpu_usable())) {
1218 WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
1219 ret = -EINVAL;
1220 } else {
1221 kernel_fpu_begin();
1222 ret = __driver_rfc4106_encrypt(req);
1223 kernel_fpu_end();
1224 }
1225 return ret;
1226}
1227
1228static int helper_rfc4106_decrypt(struct aead_request *req)
1229{
1230 int ret;
1231
1232 if (unlikely(!irq_fpu_usable())) {
1233 WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
1234 ret = -EINVAL;
1235 } else {
1236 kernel_fpu_begin();
1237 ret = __driver_rfc4106_decrypt(req);
1238 kernel_fpu_end();
1239 }
1240 return ret;
1241}
1188#endif 1242#endif
1189 1243
1190static struct crypto_alg aesni_algs[] = { { 1244static struct crypto_alg aesni_algs[] = { {
@@ -1210,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { {
1210 .cra_name = "__aes-aesni", 1264 .cra_name = "__aes-aesni",
1211 .cra_driver_name = "__driver-aes-aesni", 1265 .cra_driver_name = "__driver-aes-aesni",
1212 .cra_priority = 0, 1266 .cra_priority = 0,
1213 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 1267 .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
1214 .cra_blocksize = AES_BLOCK_SIZE, 1268 .cra_blocksize = AES_BLOCK_SIZE,
1215 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1269 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1216 AESNI_ALIGN - 1, 1270 AESNI_ALIGN - 1,
@@ -1229,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { {
1229 .cra_name = "__ecb-aes-aesni", 1283 .cra_name = "__ecb-aes-aesni",
1230 .cra_driver_name = "__driver-ecb-aes-aesni", 1284 .cra_driver_name = "__driver-ecb-aes-aesni",
1231 .cra_priority = 0, 1285 .cra_priority = 0,
1232 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1286 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1287 CRYPTO_ALG_INTERNAL,
1233 .cra_blocksize = AES_BLOCK_SIZE, 1288 .cra_blocksize = AES_BLOCK_SIZE,
1234 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1289 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1235 AESNI_ALIGN - 1, 1290 AESNI_ALIGN - 1,
@@ -1249,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { {
1249 .cra_name = "__cbc-aes-aesni", 1304 .cra_name = "__cbc-aes-aesni",
1250 .cra_driver_name = "__driver-cbc-aes-aesni", 1305 .cra_driver_name = "__driver-cbc-aes-aesni",
1251 .cra_priority = 0, 1306 .cra_priority = 0,
1252 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1307 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1308 CRYPTO_ALG_INTERNAL,
1253 .cra_blocksize = AES_BLOCK_SIZE, 1309 .cra_blocksize = AES_BLOCK_SIZE,
1254 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1310 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1255 AESNI_ALIGN - 1, 1311 AESNI_ALIGN - 1,
@@ -1313,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { {
1313 .cra_name = "__ctr-aes-aesni", 1369 .cra_name = "__ctr-aes-aesni",
1314 .cra_driver_name = "__driver-ctr-aes-aesni", 1370 .cra_driver_name = "__driver-ctr-aes-aesni",
1315 .cra_priority = 0, 1371 .cra_priority = 0,
1316 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1372 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1373 CRYPTO_ALG_INTERNAL,
1317 .cra_blocksize = 1, 1374 .cra_blocksize = 1,
1318 .cra_ctxsize = sizeof(struct crypto_aes_ctx) + 1375 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1319 AESNI_ALIGN - 1, 1376 AESNI_ALIGN - 1,
@@ -1357,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { {
1357 .cra_name = "__gcm-aes-aesni", 1414 .cra_name = "__gcm-aes-aesni",
1358 .cra_driver_name = "__driver-gcm-aes-aesni", 1415 .cra_driver_name = "__driver-gcm-aes-aesni",
1359 .cra_priority = 0, 1416 .cra_priority = 0,
1360 .cra_flags = CRYPTO_ALG_TYPE_AEAD, 1417 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
1361 .cra_blocksize = 1, 1418 .cra_blocksize = 1,
1362 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + 1419 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) +
1363 AESNI_ALIGN, 1420 AESNI_ALIGN,
@@ -1366,8 +1423,12 @@ static struct crypto_alg aesni_algs[] = { {
1366 .cra_module = THIS_MODULE, 1423 .cra_module = THIS_MODULE,
1367 .cra_u = { 1424 .cra_u = {
1368 .aead = { 1425 .aead = {
1369 .encrypt = __driver_rfc4106_encrypt, 1426 .setkey = common_rfc4106_set_key,
1370 .decrypt = __driver_rfc4106_decrypt, 1427 .setauthsize = common_rfc4106_set_authsize,
1428 .encrypt = helper_rfc4106_encrypt,
1429 .decrypt = helper_rfc4106_decrypt,
1430 .ivsize = 8,
1431 .maxauthsize = 16,
1371 }, 1432 },
1372 }, 1433 },
1373}, { 1434}, {
@@ -1423,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { {
1423 .cra_name = "__lrw-aes-aesni", 1484 .cra_name = "__lrw-aes-aesni",
1424 .cra_driver_name = "__driver-lrw-aes-aesni", 1485 .cra_driver_name = "__driver-lrw-aes-aesni",
1425 .cra_priority = 0, 1486 .cra_priority = 0,
1426 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1487 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1488 CRYPTO_ALG_INTERNAL,
1427 .cra_blocksize = AES_BLOCK_SIZE, 1489 .cra_blocksize = AES_BLOCK_SIZE,
1428 .cra_ctxsize = sizeof(struct aesni_lrw_ctx), 1490 .cra_ctxsize = sizeof(struct aesni_lrw_ctx),
1429 .cra_alignmask = 0, 1491 .cra_alignmask = 0,
@@ -1444,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { {
1444 .cra_name = "__xts-aes-aesni", 1506 .cra_name = "__xts-aes-aesni",
1445 .cra_driver_name = "__driver-xts-aes-aesni", 1507 .cra_driver_name = "__driver-xts-aes-aesni",
1446 .cra_priority = 0, 1508 .cra_priority = 0,
1447 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 1509 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
1510 CRYPTO_ALG_INTERNAL,
1448 .cra_blocksize = AES_BLOCK_SIZE, 1511 .cra_blocksize = AES_BLOCK_SIZE,
1449 .cra_ctxsize = sizeof(struct aesni_xts_ctx), 1512 .cra_ctxsize = sizeof(struct aesni_xts_ctx),
1450 .cra_alignmask = 0, 1513 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { {
343 .cra_name = "__ecb-camellia-aesni-avx2", 343 .cra_name = "__ecb-camellia-aesni-avx2",
344 .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", 344 .cra_driver_name = "__driver-ecb-camellia-aesni-avx2",
345 .cra_priority = 0, 345 .cra_priority = 0,
346 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 346 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
347 CRYPTO_ALG_INTERNAL,
347 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 348 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
348 .cra_ctxsize = sizeof(struct camellia_ctx), 349 .cra_ctxsize = sizeof(struct camellia_ctx),
349 .cra_alignmask = 0, 350 .cra_alignmask = 0,
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { {
362 .cra_name = "__cbc-camellia-aesni-avx2", 363 .cra_name = "__cbc-camellia-aesni-avx2",
363 .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", 364 .cra_driver_name = "__driver-cbc-camellia-aesni-avx2",
364 .cra_priority = 0, 365 .cra_priority = 0,
365 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 366 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
367 CRYPTO_ALG_INTERNAL,
366 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 368 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
367 .cra_ctxsize = sizeof(struct camellia_ctx), 369 .cra_ctxsize = sizeof(struct camellia_ctx),
368 .cra_alignmask = 0, 370 .cra_alignmask = 0,
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { {
381 .cra_name = "__ctr-camellia-aesni-avx2", 383 .cra_name = "__ctr-camellia-aesni-avx2",
382 .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", 384 .cra_driver_name = "__driver-ctr-camellia-aesni-avx2",
383 .cra_priority = 0, 385 .cra_priority = 0,
384 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 386 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
387 CRYPTO_ALG_INTERNAL,
385 .cra_blocksize = 1, 388 .cra_blocksize = 1,
386 .cra_ctxsize = sizeof(struct camellia_ctx), 389 .cra_ctxsize = sizeof(struct camellia_ctx),
387 .cra_alignmask = 0, 390 .cra_alignmask = 0,
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { {
401 .cra_name = "__lrw-camellia-aesni-avx2", 404 .cra_name = "__lrw-camellia-aesni-avx2",
402 .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", 405 .cra_driver_name = "__driver-lrw-camellia-aesni-avx2",
403 .cra_priority = 0, 406 .cra_priority = 0,
404 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 407 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
408 CRYPTO_ALG_INTERNAL,
405 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 409 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
406 .cra_ctxsize = sizeof(struct camellia_lrw_ctx), 410 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
407 .cra_alignmask = 0, 411 .cra_alignmask = 0,
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { {
424 .cra_name = "__xts-camellia-aesni-avx2", 428 .cra_name = "__xts-camellia-aesni-avx2",
425 .cra_driver_name = "__driver-xts-camellia-aesni-avx2", 429 .cra_driver_name = "__driver-xts-camellia-aesni-avx2",
426 .cra_priority = 0, 430 .cra_priority = 0,
427 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 431 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
432 CRYPTO_ALG_INTERNAL,
428 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 433 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
429 .cra_ctxsize = sizeof(struct camellia_xts_ctx), 434 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
430 .cra_alignmask = 0, 435 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { {
335 .cra_name = "__ecb-camellia-aesni", 335 .cra_name = "__ecb-camellia-aesni",
336 .cra_driver_name = "__driver-ecb-camellia-aesni", 336 .cra_driver_name = "__driver-ecb-camellia-aesni",
337 .cra_priority = 0, 337 .cra_priority = 0,
338 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 338 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
339 CRYPTO_ALG_INTERNAL,
339 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 340 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
340 .cra_ctxsize = sizeof(struct camellia_ctx), 341 .cra_ctxsize = sizeof(struct camellia_ctx),
341 .cra_alignmask = 0, 342 .cra_alignmask = 0,
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { {
354 .cra_name = "__cbc-camellia-aesni", 355 .cra_name = "__cbc-camellia-aesni",
355 .cra_driver_name = "__driver-cbc-camellia-aesni", 356 .cra_driver_name = "__driver-cbc-camellia-aesni",
356 .cra_priority = 0, 357 .cra_priority = 0,
357 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 358 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
359 CRYPTO_ALG_INTERNAL,
358 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 360 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
359 .cra_ctxsize = sizeof(struct camellia_ctx), 361 .cra_ctxsize = sizeof(struct camellia_ctx),
360 .cra_alignmask = 0, 362 .cra_alignmask = 0,
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { {
373 .cra_name = "__ctr-camellia-aesni", 375 .cra_name = "__ctr-camellia-aesni",
374 .cra_driver_name = "__driver-ctr-camellia-aesni", 376 .cra_driver_name = "__driver-ctr-camellia-aesni",
375 .cra_priority = 0, 377 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 378 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
379 CRYPTO_ALG_INTERNAL,
377 .cra_blocksize = 1, 380 .cra_blocksize = 1,
378 .cra_ctxsize = sizeof(struct camellia_ctx), 381 .cra_ctxsize = sizeof(struct camellia_ctx),
379 .cra_alignmask = 0, 382 .cra_alignmask = 0,
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { {
393 .cra_name = "__lrw-camellia-aesni", 396 .cra_name = "__lrw-camellia-aesni",
394 .cra_driver_name = "__driver-lrw-camellia-aesni", 397 .cra_driver_name = "__driver-lrw-camellia-aesni",
395 .cra_priority = 0, 398 .cra_priority = 0,
396 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
400 CRYPTO_ALG_INTERNAL,
397 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 401 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
398 .cra_ctxsize = sizeof(struct camellia_lrw_ctx), 402 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
399 .cra_alignmask = 0, 403 .cra_alignmask = 0,
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { {
416 .cra_name = "__xts-camellia-aesni", 420 .cra_name = "__xts-camellia-aesni",
417 .cra_driver_name = "__driver-xts-camellia-aesni", 421 .cra_driver_name = "__driver-xts-camellia-aesni",
418 .cra_priority = 0, 422 .cra_priority = 0,
419 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 423 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
424 CRYPTO_ALG_INTERNAL,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE, 425 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct camellia_xts_ctx), 426 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
422 .cra_alignmask = 0, 427 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { {
341 .cra_name = "__ecb-cast5-avx", 341 .cra_name = "__ecb-cast5-avx",
342 .cra_driver_name = "__driver-ecb-cast5-avx", 342 .cra_driver_name = "__driver-ecb-cast5-avx",
343 .cra_priority = 0, 343 .cra_priority = 0,
344 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 344 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
345 CRYPTO_ALG_INTERNAL,
345 .cra_blocksize = CAST5_BLOCK_SIZE, 346 .cra_blocksize = CAST5_BLOCK_SIZE,
346 .cra_ctxsize = sizeof(struct cast5_ctx), 347 .cra_ctxsize = sizeof(struct cast5_ctx),
347 .cra_alignmask = 0, 348 .cra_alignmask = 0,
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { {
360 .cra_name = "__cbc-cast5-avx", 361 .cra_name = "__cbc-cast5-avx",
361 .cra_driver_name = "__driver-cbc-cast5-avx", 362 .cra_driver_name = "__driver-cbc-cast5-avx",
362 .cra_priority = 0, 363 .cra_priority = 0,
363 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 364 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
365 CRYPTO_ALG_INTERNAL,
364 .cra_blocksize = CAST5_BLOCK_SIZE, 366 .cra_blocksize = CAST5_BLOCK_SIZE,
365 .cra_ctxsize = sizeof(struct cast5_ctx), 367 .cra_ctxsize = sizeof(struct cast5_ctx),
366 .cra_alignmask = 0, 368 .cra_alignmask = 0,
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { {
379 .cra_name = "__ctr-cast5-avx", 381 .cra_name = "__ctr-cast5-avx",
380 .cra_driver_name = "__driver-ctr-cast5-avx", 382 .cra_driver_name = "__driver-ctr-cast5-avx",
381 .cra_priority = 0, 383 .cra_priority = 0,
382 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 384 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
385 CRYPTO_ALG_INTERNAL,
383 .cra_blocksize = 1, 386 .cra_blocksize = 1,
384 .cra_ctxsize = sizeof(struct cast5_ctx), 387 .cra_ctxsize = sizeof(struct cast5_ctx),
385 .cra_alignmask = 0, 388 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { {
372 .cra_name = "__ecb-cast6-avx", 372 .cra_name = "__ecb-cast6-avx",
373 .cra_driver_name = "__driver-ecb-cast6-avx", 373 .cra_driver_name = "__driver-ecb-cast6-avx",
374 .cra_priority = 0, 374 .cra_priority = 0,
375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
376 CRYPTO_ALG_INTERNAL,
376 .cra_blocksize = CAST6_BLOCK_SIZE, 377 .cra_blocksize = CAST6_BLOCK_SIZE,
377 .cra_ctxsize = sizeof(struct cast6_ctx), 378 .cra_ctxsize = sizeof(struct cast6_ctx),
378 .cra_alignmask = 0, 379 .cra_alignmask = 0,
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { {
391 .cra_name = "__cbc-cast6-avx", 392 .cra_name = "__cbc-cast6-avx",
392 .cra_driver_name = "__driver-cbc-cast6-avx", 393 .cra_driver_name = "__driver-cbc-cast6-avx",
393 .cra_priority = 0, 394 .cra_priority = 0,
394 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 395 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
396 CRYPTO_ALG_INTERNAL,
395 .cra_blocksize = CAST6_BLOCK_SIZE, 397 .cra_blocksize = CAST6_BLOCK_SIZE,
396 .cra_ctxsize = sizeof(struct cast6_ctx), 398 .cra_ctxsize = sizeof(struct cast6_ctx),
397 .cra_alignmask = 0, 399 .cra_alignmask = 0,
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { {
410 .cra_name = "__ctr-cast6-avx", 412 .cra_name = "__ctr-cast6-avx",
411 .cra_driver_name = "__driver-ctr-cast6-avx", 413 .cra_driver_name = "__driver-ctr-cast6-avx",
412 .cra_priority = 0, 414 .cra_priority = 0,
413 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 415 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
416 CRYPTO_ALG_INTERNAL,
414 .cra_blocksize = 1, 417 .cra_blocksize = 1,
415 .cra_ctxsize = sizeof(struct cast6_ctx), 418 .cra_ctxsize = sizeof(struct cast6_ctx),
416 .cra_alignmask = 0, 419 .cra_alignmask = 0,
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { {
430 .cra_name = "__lrw-cast6-avx", 433 .cra_name = "__lrw-cast6-avx",
431 .cra_driver_name = "__driver-lrw-cast6-avx", 434 .cra_driver_name = "__driver-lrw-cast6-avx",
432 .cra_priority = 0, 435 .cra_priority = 0,
433 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 436 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
437 CRYPTO_ALG_INTERNAL,
434 .cra_blocksize = CAST6_BLOCK_SIZE, 438 .cra_blocksize = CAST6_BLOCK_SIZE,
435 .cra_ctxsize = sizeof(struct cast6_lrw_ctx), 439 .cra_ctxsize = sizeof(struct cast6_lrw_ctx),
436 .cra_alignmask = 0, 440 .cra_alignmask = 0,
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { {
453 .cra_name = "__xts-cast6-avx", 457 .cra_name = "__xts-cast6-avx",
454 .cra_driver_name = "__driver-xts-cast6-avx", 458 .cra_driver_name = "__driver-xts-cast6-avx",
455 .cra_priority = 0, 459 .cra_priority = 0,
456 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 460 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
461 CRYPTO_ALG_INTERNAL,
457 .cra_blocksize = CAST6_BLOCK_SIZE, 462 .cra_blocksize = CAST6_BLOCK_SIZE,
458 .cra_ctxsize = sizeof(struct cast6_xts_ctx), 463 .cra_ctxsize = sizeof(struct cast6_xts_ctx),
459 .cra_alignmask = 0, 464 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
178 ## 2a) PROCESS FULL BLOCKS: 178 ## 2a) PROCESS FULL BLOCKS:
179 ################################################################ 179 ################################################################
180full_block: 180full_block:
181 movq $128,%rax 181 movl $128,%eax
182 lea 128*8*2(block_0), block_1 182 lea 128*8*2(block_0), block_1
183 lea 128*8*3(block_0), block_2 183 lea 128*8*3(block_0), block_2
184 add $128*8*1, block_0 184 add $128*8*1, block_0
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
154 .cra_name = "__ghash", 154 .cra_name = "__ghash",
155 .cra_driver_name = "__ghash-pclmulqdqni", 155 .cra_driver_name = "__ghash-pclmulqdqni",
156 .cra_priority = 0, 156 .cra_priority = 0,
157 .cra_flags = CRYPTO_ALG_TYPE_SHASH, 157 .cra_flags = CRYPTO_ALG_TYPE_SHASH |
158 CRYPTO_ALG_INTERNAL,
158 .cra_blocksize = GHASH_BLOCK_SIZE, 159 .cra_blocksize = GHASH_BLOCK_SIZE,
159 .cra_ctxsize = sizeof(struct ghash_ctx), 160 .cra_ctxsize = sizeof(struct ghash_ctx),
160 .cra_module = THIS_MODULE, 161 .cra_module = THIS_MODULE,
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm)
261 struct cryptd_ahash *cryptd_tfm; 262 struct cryptd_ahash *cryptd_tfm;
262 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); 263 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
263 264
264 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); 265 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
266 CRYPTO_ALG_INTERNAL,
267 CRYPTO_ALG_INTERNAL);
265 if (IS_ERR(cryptd_tfm)) 268 if (IS_ERR(cryptd_tfm))
266 return PTR_ERR(cryptd_tfm); 269 return PTR_ERR(cryptd_tfm);
267 ctx->cryptd_tfm = cryptd_tfm; 270 ctx->cryptd_tfm = cryptd_tfm;
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
232 232
233 le128_to_be128((be128 *)walk->iv, &ctrblk); 233 le128_to_be128((be128 *)walk->iv, &ctrblk);
234} 234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236 235
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, 236static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc, 237 struct blkcipher_desc *desc,
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { {
309 .cra_name = "__ecb-serpent-avx2", 309 .cra_name = "__ecb-serpent-avx2",
310 .cra_driver_name = "__driver-ecb-serpent-avx2", 310 .cra_driver_name = "__driver-ecb-serpent-avx2",
311 .cra_priority = 0, 311 .cra_priority = 0,
312 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 312 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
313 CRYPTO_ALG_INTERNAL,
313 .cra_blocksize = SERPENT_BLOCK_SIZE, 314 .cra_blocksize = SERPENT_BLOCK_SIZE,
314 .cra_ctxsize = sizeof(struct serpent_ctx), 315 .cra_ctxsize = sizeof(struct serpent_ctx),
315 .cra_alignmask = 0, 316 .cra_alignmask = 0,
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { {
329 .cra_name = "__cbc-serpent-avx2", 330 .cra_name = "__cbc-serpent-avx2",
330 .cra_driver_name = "__driver-cbc-serpent-avx2", 331 .cra_driver_name = "__driver-cbc-serpent-avx2",
331 .cra_priority = 0, 332 .cra_priority = 0,
332 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 333 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
334 CRYPTO_ALG_INTERNAL,
333 .cra_blocksize = SERPENT_BLOCK_SIZE, 335 .cra_blocksize = SERPENT_BLOCK_SIZE,
334 .cra_ctxsize = sizeof(struct serpent_ctx), 336 .cra_ctxsize = sizeof(struct serpent_ctx),
335 .cra_alignmask = 0, 337 .cra_alignmask = 0,
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { {
349 .cra_name = "__ctr-serpent-avx2", 351 .cra_name = "__ctr-serpent-avx2",
350 .cra_driver_name = "__driver-ctr-serpent-avx2", 352 .cra_driver_name = "__driver-ctr-serpent-avx2",
351 .cra_priority = 0, 353 .cra_priority = 0,
352 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 354 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
355 CRYPTO_ALG_INTERNAL,
353 .cra_blocksize = 1, 356 .cra_blocksize = 1,
354 .cra_ctxsize = sizeof(struct serpent_ctx), 357 .cra_ctxsize = sizeof(struct serpent_ctx),
355 .cra_alignmask = 0, 358 .cra_alignmask = 0,
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { {
370 .cra_name = "__lrw-serpent-avx2", 373 .cra_name = "__lrw-serpent-avx2",
371 .cra_driver_name = "__driver-lrw-serpent-avx2", 374 .cra_driver_name = "__driver-lrw-serpent-avx2",
372 .cra_priority = 0, 375 .cra_priority = 0,
373 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
377 CRYPTO_ALG_INTERNAL,
374 .cra_blocksize = SERPENT_BLOCK_SIZE, 378 .cra_blocksize = SERPENT_BLOCK_SIZE,
375 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 379 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
376 .cra_alignmask = 0, 380 .cra_alignmask = 0,
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { {
394 .cra_name = "__xts-serpent-avx2", 398 .cra_name = "__xts-serpent-avx2",
395 .cra_driver_name = "__driver-xts-serpent-avx2", 399 .cra_driver_name = "__driver-xts-serpent-avx2",
396 .cra_priority = 0, 400 .cra_priority = 0,
397 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
402 CRYPTO_ALG_INTERNAL,
398 .cra_blocksize = SERPENT_BLOCK_SIZE, 403 .cra_blocksize = SERPENT_BLOCK_SIZE,
399 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 404 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
400 .cra_alignmask = 0, 405 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { {
378 .cra_name = "__ecb-serpent-avx", 378 .cra_name = "__ecb-serpent-avx",
379 .cra_driver_name = "__driver-ecb-serpent-avx", 379 .cra_driver_name = "__driver-ecb-serpent-avx",
380 .cra_priority = 0, 380 .cra_priority = 0,
381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
382 CRYPTO_ALG_INTERNAL,
382 .cra_blocksize = SERPENT_BLOCK_SIZE, 383 .cra_blocksize = SERPENT_BLOCK_SIZE,
383 .cra_ctxsize = sizeof(struct serpent_ctx), 384 .cra_ctxsize = sizeof(struct serpent_ctx),
384 .cra_alignmask = 0, 385 .cra_alignmask = 0,
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { {
397 .cra_name = "__cbc-serpent-avx", 398 .cra_name = "__cbc-serpent-avx",
398 .cra_driver_name = "__driver-cbc-serpent-avx", 399 .cra_driver_name = "__driver-cbc-serpent-avx",
399 .cra_priority = 0, 400 .cra_priority = 0,
400 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
402 CRYPTO_ALG_INTERNAL,
401 .cra_blocksize = SERPENT_BLOCK_SIZE, 403 .cra_blocksize = SERPENT_BLOCK_SIZE,
402 .cra_ctxsize = sizeof(struct serpent_ctx), 404 .cra_ctxsize = sizeof(struct serpent_ctx),
403 .cra_alignmask = 0, 405 .cra_alignmask = 0,
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { {
416 .cra_name = "__ctr-serpent-avx", 418 .cra_name = "__ctr-serpent-avx",
417 .cra_driver_name = "__driver-ctr-serpent-avx", 419 .cra_driver_name = "__driver-ctr-serpent-avx",
418 .cra_priority = 0, 420 .cra_priority = 0,
419 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 421 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
422 CRYPTO_ALG_INTERNAL,
420 .cra_blocksize = 1, 423 .cra_blocksize = 1,
421 .cra_ctxsize = sizeof(struct serpent_ctx), 424 .cra_ctxsize = sizeof(struct serpent_ctx),
422 .cra_alignmask = 0, 425 .cra_alignmask = 0,
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { {
436 .cra_name = "__lrw-serpent-avx", 439 .cra_name = "__lrw-serpent-avx",
437 .cra_driver_name = "__driver-lrw-serpent-avx", 440 .cra_driver_name = "__driver-lrw-serpent-avx",
438 .cra_priority = 0, 441 .cra_priority = 0,
439 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 442 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
443 CRYPTO_ALG_INTERNAL,
440 .cra_blocksize = SERPENT_BLOCK_SIZE, 444 .cra_blocksize = SERPENT_BLOCK_SIZE,
441 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 445 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
442 .cra_alignmask = 0, 446 .cra_alignmask = 0,
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { {
459 .cra_name = "__xts-serpent-avx", 463 .cra_name = "__xts-serpent-avx",
460 .cra_driver_name = "__driver-xts-serpent-avx", 464 .cra_driver_name = "__driver-xts-serpent-avx",
461 .cra_priority = 0, 465 .cra_priority = 0,
462 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 466 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
467 CRYPTO_ALG_INTERNAL,
463 .cra_blocksize = SERPENT_BLOCK_SIZE, 468 .cra_blocksize = SERPENT_BLOCK_SIZE,
464 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 469 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
465 .cra_alignmask = 0, 470 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index bf025adaea01..3643dd508f45 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { {
387 .cra_name = "__ecb-serpent-sse2", 387 .cra_name = "__ecb-serpent-sse2",
388 .cra_driver_name = "__driver-ecb-serpent-sse2", 388 .cra_driver_name = "__driver-ecb-serpent-sse2",
389 .cra_priority = 0, 389 .cra_priority = 0,
390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
391 CRYPTO_ALG_INTERNAL,
391 .cra_blocksize = SERPENT_BLOCK_SIZE, 392 .cra_blocksize = SERPENT_BLOCK_SIZE,
392 .cra_ctxsize = sizeof(struct serpent_ctx), 393 .cra_ctxsize = sizeof(struct serpent_ctx),
393 .cra_alignmask = 0, 394 .cra_alignmask = 0,
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { {
406 .cra_name = "__cbc-serpent-sse2", 407 .cra_name = "__cbc-serpent-sse2",
407 .cra_driver_name = "__driver-cbc-serpent-sse2", 408 .cra_driver_name = "__driver-cbc-serpent-sse2",
408 .cra_priority = 0, 409 .cra_priority = 0,
409 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 410 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
411 CRYPTO_ALG_INTERNAL,
410 .cra_blocksize = SERPENT_BLOCK_SIZE, 412 .cra_blocksize = SERPENT_BLOCK_SIZE,
411 .cra_ctxsize = sizeof(struct serpent_ctx), 413 .cra_ctxsize = sizeof(struct serpent_ctx),
412 .cra_alignmask = 0, 414 .cra_alignmask = 0,
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { {
425 .cra_name = "__ctr-serpent-sse2", 427 .cra_name = "__ctr-serpent-sse2",
426 .cra_driver_name = "__driver-ctr-serpent-sse2", 428 .cra_driver_name = "__driver-ctr-serpent-sse2",
427 .cra_priority = 0, 429 .cra_priority = 0,
428 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 430 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
431 CRYPTO_ALG_INTERNAL,
429 .cra_blocksize = 1, 432 .cra_blocksize = 1,
430 .cra_ctxsize = sizeof(struct serpent_ctx), 433 .cra_ctxsize = sizeof(struct serpent_ctx),
431 .cra_alignmask = 0, 434 .cra_alignmask = 0,
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { {
445 .cra_name = "__lrw-serpent-sse2", 448 .cra_name = "__lrw-serpent-sse2",
446 .cra_driver_name = "__driver-lrw-serpent-sse2", 449 .cra_driver_name = "__driver-lrw-serpent-sse2",
447 .cra_priority = 0, 450 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 451 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
452 CRYPTO_ALG_INTERNAL,
449 .cra_blocksize = SERPENT_BLOCK_SIZE, 453 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 454 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0, 455 .cra_alignmask = 0,
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { {
468 .cra_name = "__xts-serpent-sse2", 472 .cra_name = "__xts-serpent-sse2",
469 .cra_driver_name = "__driver-xts-serpent-sse2", 473 .cra_driver_name = "__driver-xts-serpent-sse2",
470 .cra_priority = 0, 474 .cra_priority = 0,
471 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 475 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
476 CRYPTO_ALG_INTERNAL,
472 .cra_blocksize = SERPENT_BLOCK_SIZE, 477 .cra_blocksize = SERPENT_BLOCK_SIZE,
473 .cra_ctxsize = sizeof(struct serpent_xts_ctx), 478 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
474 .cra_alignmask = 0, 479 .cra_alignmask = 0,
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index fd9f6b035b16..e510b1c5d690 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = {
694 * use ASYNC flag as some buffers in multi-buffer 694 * use ASYNC flag as some buffers in multi-buffer
695 * algo may not have completed before hashing thread sleep 695 * algo may not have completed before hashing thread sleep
696 */ 696 */
697 .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, 697 .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
698 CRYPTO_ALG_INTERNAL,
698 .cra_blocksize = SHA1_BLOCK_SIZE, 699 .cra_blocksize = SHA1_BLOCK_SIZE,
699 .cra_module = THIS_MODULE, 700 .cra_module = THIS_MODULE,
700 .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), 701 .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
770 struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); 771 struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
771 struct mcryptd_hash_ctx *mctx; 772 struct mcryptd_hash_ctx *mctx;
772 773
773 mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); 774 mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
775 CRYPTO_ALG_INTERNAL,
776 CRYPTO_ALG_INTERNAL);
774 if (IS_ERR(mcryptd_tfm)) 777 if (IS_ERR(mcryptd_tfm))
775 return PTR_ERR(mcryptd_tfm); 778 return PTR_ERR(mcryptd_tfm);
776 mctx = crypto_ahash_ctx(&mcryptd_tfm->base); 779 mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
@@ -828,7 +831,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
828 while (!list_empty(&cstate->work_list)) { 831 while (!list_empty(&cstate->work_list)) {
829 rctx = list_entry(cstate->work_list.next, 832 rctx = list_entry(cstate->work_list.next,
830 struct mcryptd_hash_request_ctx, waiter); 833 struct mcryptd_hash_request_ctx, waiter);
831 if time_before(cur_time, rctx->tag.expire) 834 if (time_before(cur_time, rctx->tag.expire))
832 break; 835 break;
833 kernel_fpu_begin(); 836 kernel_fpu_begin();
834 sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); 837 sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
index 4ca7e166a2aa..822acb5b464c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -56,7 +56,7 @@
56void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) 56void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
57{ 57{
58 unsigned int j; 58 unsigned int j;
59 state->unused_lanes = 0xF76543210; 59 state->unused_lanes = 0xF76543210ULL;
60 for (j = 0; j < 8; j++) { 60 for (j = 0; j < 8; j++) {
61 state->lens[j] = 0xFFFFFFFF; 61 state->lens[j] = 0xFFFFFFFF;
62 state->ldata[j].job_in_lane = NULL; 62 state->ldata[j].job_in_lane = NULL;
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 6c20fe04a738..33d1b9dc14cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -28,7 +28,7 @@
28#include <linux/cryptohash.h> 28#include <linux/cryptohash.h>
29#include <linux/types.h> 29#include <linux/types.h>
30#include <crypto/sha.h> 30#include <crypto/sha.h>
31#include <asm/byteorder.h> 31#include <crypto/sha1_base.h>
32#include <asm/i387.h> 32#include <asm/i387.h>
33#include <asm/xcr.h> 33#include <asm/xcr.h>
34#include <asm/xsave.h> 34#include <asm/xsave.h>
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
44#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ 44#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
45 45
46asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, 46asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
47 unsigned int rounds); 47 unsigned int rounds);
48#endif 48#endif
49 49
50static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); 50static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
51
52
53static int sha1_ssse3_init(struct shash_desc *desc)
54{
55 struct sha1_state *sctx = shash_desc_ctx(desc);
56
57 *sctx = (struct sha1_state){
58 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
59 };
60
61 return 0;
62}
63
64static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
65 unsigned int len, unsigned int partial)
66{
67 struct sha1_state *sctx = shash_desc_ctx(desc);
68 unsigned int done = 0;
69
70 sctx->count += len;
71
72 if (partial) {
73 done = SHA1_BLOCK_SIZE - partial;
74 memcpy(sctx->buffer + partial, data, done);
75 sha1_transform_asm(sctx->state, sctx->buffer, 1);
76 }
77
78 if (len - done >= SHA1_BLOCK_SIZE) {
79 const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
80
81 sha1_transform_asm(sctx->state, data + done, rounds);
82 done += rounds * SHA1_BLOCK_SIZE;
83 }
84
85 memcpy(sctx->buffer, data + done, len - done);
86
87 return 0;
88}
89 51
90static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, 52static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
91 unsigned int len) 53 unsigned int len)
92{ 54{
93 struct sha1_state *sctx = shash_desc_ctx(desc); 55 struct sha1_state *sctx = shash_desc_ctx(desc);
94 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
95 int res;
96 56
97 /* Handle the fast case right here */ 57 if (!irq_fpu_usable() ||
98 if (partial + len < SHA1_BLOCK_SIZE) { 58 (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
99 sctx->count += len; 59 return crypto_sha1_update(desc, data, len);
100 memcpy(sctx->buffer + partial, data, len);
101 60
102 return 0; 61 /* make sure casting to sha1_block_fn() is safe */
103 } 62 BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
104 63
105 if (!irq_fpu_usable()) { 64 kernel_fpu_begin();
106 res = crypto_sha1_update(desc, data, len); 65 sha1_base_do_update(desc, data, len,
107 } else { 66 (sha1_block_fn *)sha1_transform_asm);
108 kernel_fpu_begin(); 67 kernel_fpu_end();
109 res = __sha1_ssse3_update(desc, data, len, partial);
110 kernel_fpu_end();
111 }
112
113 return res;
114}
115
116
117/* Add padding and return the message digest. */
118static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
119{
120 struct sha1_state *sctx = shash_desc_ctx(desc);
121 unsigned int i, index, padlen;
122 __be32 *dst = (__be32 *)out;
123 __be64 bits;
124 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
125
126 bits = cpu_to_be64(sctx->count << 3);
127
128 /* Pad out to 56 mod 64 and append length */
129 index = sctx->count % SHA1_BLOCK_SIZE;
130 padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
131 if (!irq_fpu_usable()) {
132 crypto_sha1_update(desc, padding, padlen);
133 crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
134 } else {
135 kernel_fpu_begin();
136 /* We need to fill a whole block for __sha1_ssse3_update() */
137 if (padlen <= 56) {
138 sctx->count += padlen;
139 memcpy(sctx->buffer + index, padding, padlen);
140 } else {
141 __sha1_ssse3_update(desc, padding, padlen, index);
142 }
143 __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
144 kernel_fpu_end();
145 }
146
147 /* Store state in digest */
148 for (i = 0; i < 5; i++)
149 dst[i] = cpu_to_be32(sctx->state[i]);
150
151 /* Wipe context */
152 memset(sctx, 0, sizeof(*sctx));
153 68
154 return 0; 69 return 0;
155} 70}
156 71
157static int sha1_ssse3_export(struct shash_desc *desc, void *out) 72static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
73 unsigned int len, u8 *out)
158{ 74{
159 struct sha1_state *sctx = shash_desc_ctx(desc); 75 if (!irq_fpu_usable())
76 return crypto_sha1_finup(desc, data, len, out);
160 77
161 memcpy(out, sctx, sizeof(*sctx)); 78 kernel_fpu_begin();
79 if (len)
80 sha1_base_do_update(desc, data, len,
81 (sha1_block_fn *)sha1_transform_asm);
82 sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
83 kernel_fpu_end();
162 84
163 return 0; 85 return sha1_base_finish(desc, out);
164} 86}
165 87
166static int sha1_ssse3_import(struct shash_desc *desc, const void *in) 88/* Add padding and return the message digest. */
89static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
167{ 90{
168 struct sha1_state *sctx = shash_desc_ctx(desc); 91 return sha1_ssse3_finup(desc, NULL, 0, out);
169
170 memcpy(sctx, in, sizeof(*sctx));
171
172 return 0;
173} 92}
174 93
175#ifdef CONFIG_AS_AVX2 94#ifdef CONFIG_AS_AVX2
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data,
186 105
187static struct shash_alg alg = { 106static struct shash_alg alg = {
188 .digestsize = SHA1_DIGEST_SIZE, 107 .digestsize = SHA1_DIGEST_SIZE,
189 .init = sha1_ssse3_init, 108 .init = sha1_base_init,
190 .update = sha1_ssse3_update, 109 .update = sha1_ssse3_update,
191 .final = sha1_ssse3_final, 110 .final = sha1_ssse3_final,
192 .export = sha1_ssse3_export, 111 .finup = sha1_ssse3_finup,
193 .import = sha1_ssse3_import,
194 .descsize = sizeof(struct sha1_state), 112 .descsize = sizeof(struct sha1_state),
195 .statesize = sizeof(struct sha1_state),
196 .base = { 113 .base = {
197 .cra_name = "sha1", 114 .cra_name = "sha1",
198 .cra_driver_name= "sha1-ssse3", 115 .cra_driver_name= "sha1-ssse3",
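
The rewritten glue code above no longer carries its own buffering and padding: sha1_base_do_update()/sha1_base_do_finalize() handle that, the SIMD work is bracketed by kernel_fpu_begin()/kernel_fpu_end(), and the portable crypto_sha1_*() routines are used whenever the FPU cannot be touched in the current context. A condensed sketch of the finup shape, hard-wired to the SSSE3 transform for brevity (the real code dispatches through the sha1_transform_asm pointer chosen at module init):

#include <linux/linkage.h>
#include <crypto/internal/hash.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <asm/i387.h>

asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
				     unsigned int rounds);

static int demo_sha1_finup(struct shash_desc *desc, const u8 *data,
			   unsigned int len, u8 *out)
{
	if (!irq_fpu_usable())
		return crypto_sha1_finup(desc, data, len, out);

	kernel_fpu_begin();
	if (len)
		sha1_base_do_update(desc, data, len,
				    (sha1_block_fn *)sha1_transform_ssse3);
	/* Appends the 0x80 padding and 64-bit length, then runs the final block(s). */
	sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_ssse3);
	kernel_fpu_end();

	/* Writes the big-endian digest to *out and wipes the descriptor state. */
	return sha1_base_finish(desc, out);
}

/* .final is simply .finup with no trailing data. */
static int demo_sha1_final(struct shash_desc *desc, u8 *out)
{
	return demo_sha1_finup(desc, NULL, 0, out);
}

The sha256 and sha512 glue files below follow the identical shape with their respective base helpers.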
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 642f15687a0a..92b3b5d75ba9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
96BYTE_FLIP_MASK = %xmm13 96BYTE_FLIP_MASK = %xmm13
97 97
98NUM_BLKS = %rdx # 3rd arg 98NUM_BLKS = %rdx # 3rd arg
99CTX = %rsi # 2nd arg 99INP = %rsi # 2nd arg
100INP = %rdi # 1st arg 100CTX = %rdi # 1st arg
101 101
102SRND = %rdi # clobbers INP 102SRND = %rsi # clobbers INP
103c = %ecx 103c = %ecx
104d = %r8d 104d = %r8d
105e = %edx 105e = %edx
@@ -342,8 +342,8 @@ a = TMP_
342 342
343######################################################################## 343########################################################################
344## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) 344## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
345## arg 1 : pointer to input data 345## arg 1 : pointer to digest
346## arg 2 : pointer to digest 346## arg 2 : pointer to input data
347## arg 3 : Num blocks 347## arg 3 : Num blocks
348######################################################################## 348########################################################################
349.text 349.text
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9e86944c539d..570ec5ec62d7 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13
91X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK 91X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
92 92
93NUM_BLKS = %rdx # 3rd arg 93NUM_BLKS = %rdx # 3rd arg
94CTX = %rsi # 2nd arg 94INP = %rsi # 2nd arg
95INP = %rdi # 1st arg 95CTX = %rdi # 1st arg
96c = %ecx 96c = %ecx
97d = %r8d 97d = %r8d
98e = %edx # clobbers NUM_BLKS 98e = %edx # clobbers NUM_BLKS
99y3 = %edi # clobbers INP 99y3 = %esi # clobbers INP
100 100
101 101
102TBL = %rbp 102TBL = %rbp
@@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE
523 523
524######################################################################## 524########################################################################
525## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) 525## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
526## arg 1 : pointer to input data 526## arg 1 : pointer to digest
527## arg 2 : pointer to digest 527## arg 2 : pointer to input data
528## arg 3 : Num blocks 528## arg 3 : Num blocks
529######################################################################## 529########################################################################
530.text 530.text
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index f833b74d902b..2cedc44e8121 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
88BYTE_FLIP_MASK = %xmm12 88BYTE_FLIP_MASK = %xmm12
89 89
90NUM_BLKS = %rdx # 3rd arg 90NUM_BLKS = %rdx # 3rd arg
91CTX = %rsi # 2nd arg 91INP = %rsi # 2nd arg
92INP = %rdi # 1st arg 92CTX = %rdi # 1st arg
93 93
94SRND = %rdi # clobbers INP 94SRND = %rsi # clobbers INP
95c = %ecx 95c = %ecx
96d = %r8d 96d = %r8d
97e = %edx 97e = %edx
@@ -348,8 +348,8 @@ a = TMP_
348 348
349######################################################################## 349########################################################################
350## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) 350## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
351## arg 1 : pointer to input data 351## arg 1 : pointer to digest
352## arg 2 : pointer to digest 352## arg 2 : pointer to input data
353## arg 3 : Num blocks 353## arg 3 : Num blocks
354######################################################################## 354########################################################################
355.text 355.text
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 8fad72f4dfd2..ccc338881ee8 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -36,195 +36,74 @@
36#include <linux/cryptohash.h> 36#include <linux/cryptohash.h>
37#include <linux/types.h> 37#include <linux/types.h>
38#include <crypto/sha.h> 38#include <crypto/sha.h>
39#include <asm/byteorder.h> 39#include <crypto/sha256_base.h>
40#include <asm/i387.h> 40#include <asm/i387.h>
41#include <asm/xcr.h> 41#include <asm/xcr.h>
42#include <asm/xsave.h> 42#include <asm/xsave.h>
43#include <linux/string.h> 43#include <linux/string.h>
44 44
45asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, 45asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
46 u64 rounds); 46 u64 rounds);
47#ifdef CONFIG_AS_AVX 47#ifdef CONFIG_AS_AVX
48asmlinkage void sha256_transform_avx(const char *data, u32 *digest, 48asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
49 u64 rounds); 49 u64 rounds);
50#endif 50#endif
51#ifdef CONFIG_AS_AVX2 51#ifdef CONFIG_AS_AVX2
52asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, 52asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
53 u64 rounds); 53 u64 rounds);
54#endif 54#endif
55 55
56static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); 56static void (*sha256_transform_asm)(u32 *, const char *, u64);
57
58
59static int sha256_ssse3_init(struct shash_desc *desc)
60{
61 struct sha256_state *sctx = shash_desc_ctx(desc);
62
63 sctx->state[0] = SHA256_H0;
64 sctx->state[1] = SHA256_H1;
65 sctx->state[2] = SHA256_H2;
66 sctx->state[3] = SHA256_H3;
67 sctx->state[4] = SHA256_H4;
68 sctx->state[5] = SHA256_H5;
69 sctx->state[6] = SHA256_H6;
70 sctx->state[7] = SHA256_H7;
71 sctx->count = 0;
72
73 return 0;
74}
75
76static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
77 unsigned int len, unsigned int partial)
78{
79 struct sha256_state *sctx = shash_desc_ctx(desc);
80 unsigned int done = 0;
81
82 sctx->count += len;
83
84 if (partial) {
85 done = SHA256_BLOCK_SIZE - partial;
86 memcpy(sctx->buf + partial, data, done);
87 sha256_transform_asm(sctx->buf, sctx->state, 1);
88 }
89
90 if (len - done >= SHA256_BLOCK_SIZE) {
91 const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
92
93 sha256_transform_asm(data + done, sctx->state, (u64) rounds);
94
95 done += rounds * SHA256_BLOCK_SIZE;
96 }
97
98 memcpy(sctx->buf, data + done, len - done);
99
100 return 0;
101}
102 57
103static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, 58static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
104 unsigned int len) 59 unsigned int len)
105{ 60{
106 struct sha256_state *sctx = shash_desc_ctx(desc); 61 struct sha256_state *sctx = shash_desc_ctx(desc);
107 unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
108 int res;
109 62
110 /* Handle the fast case right here */ 63 if (!irq_fpu_usable() ||
111 if (partial + len < SHA256_BLOCK_SIZE) { 64 (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
112 sctx->count += len; 65 return crypto_sha256_update(desc, data, len);
113 memcpy(sctx->buf + partial, data, len);
114 66
115 return 0; 67 /* make sure casting to sha256_block_fn() is safe */
116 } 68 BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
117
118 if (!irq_fpu_usable()) {
119 res = crypto_sha256_update(desc, data, len);
120 } else {
121 kernel_fpu_begin();
122 res = __sha256_ssse3_update(desc, data, len, partial);
123 kernel_fpu_end();
124 }
125
126 return res;
127}
128 69
129 70 kernel_fpu_begin();
130/* Add padding and return the message digest. */ 71 sha256_base_do_update(desc, data, len,
131static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) 72 (sha256_block_fn *)sha256_transform_asm);
132{ 73 kernel_fpu_end();
133 struct sha256_state *sctx = shash_desc_ctx(desc);
134 unsigned int i, index, padlen;
135 __be32 *dst = (__be32 *)out;
136 __be64 bits;
137 static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
138
139 bits = cpu_to_be64(sctx->count << 3);
140
141 /* Pad out to 56 mod 64 and append length */
142 index = sctx->count % SHA256_BLOCK_SIZE;
143 padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
144
145 if (!irq_fpu_usable()) {
146 crypto_sha256_update(desc, padding, padlen);
147 crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
148 } else {
149 kernel_fpu_begin();
150 /* We need to fill a whole block for __sha256_ssse3_update() */
151 if (padlen <= 56) {
152 sctx->count += padlen;
153 memcpy(sctx->buf + index, padding, padlen);
154 } else {
155 __sha256_ssse3_update(desc, padding, padlen, index);
156 }
157 __sha256_ssse3_update(desc, (const u8 *)&bits,
158 sizeof(bits), 56);
159 kernel_fpu_end();
160 }
161
162 /* Store state in digest */
163 for (i = 0; i < 8; i++)
164 dst[i] = cpu_to_be32(sctx->state[i]);
165
166 /* Wipe context */
167 memset(sctx, 0, sizeof(*sctx));
168 74
169 return 0; 75 return 0;
170} 76}
171 77
172static int sha256_ssse3_export(struct shash_desc *desc, void *out) 78static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
79 unsigned int len, u8 *out)
173{ 80{
174 struct sha256_state *sctx = shash_desc_ctx(desc); 81 if (!irq_fpu_usable())
82 return crypto_sha256_finup(desc, data, len, out);
175 83
176 memcpy(out, sctx, sizeof(*sctx)); 84 kernel_fpu_begin();
85 if (len)
86 sha256_base_do_update(desc, data, len,
87 (sha256_block_fn *)sha256_transform_asm);
88 sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
89 kernel_fpu_end();
177 90
178 return 0; 91 return sha256_base_finish(desc, out);
179} 92}
180 93
181static int sha256_ssse3_import(struct shash_desc *desc, const void *in) 94/* Add padding and return the message digest. */
182{ 95static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
183 struct sha256_state *sctx = shash_desc_ctx(desc);
184
185 memcpy(sctx, in, sizeof(*sctx));
186
187 return 0;
188}
189
190static int sha224_ssse3_init(struct shash_desc *desc)
191{
192 struct sha256_state *sctx = shash_desc_ctx(desc);
193
194 sctx->state[0] = SHA224_H0;
195 sctx->state[1] = SHA224_H1;
196 sctx->state[2] = SHA224_H2;
197 sctx->state[3] = SHA224_H3;
198 sctx->state[4] = SHA224_H4;
199 sctx->state[5] = SHA224_H5;
200 sctx->state[6] = SHA224_H6;
201 sctx->state[7] = SHA224_H7;
202 sctx->count = 0;
203
204 return 0;
205}
206
207static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
208{ 96{
209 u8 D[SHA256_DIGEST_SIZE]; 97 return sha256_ssse3_finup(desc, NULL, 0, out);
210
211 sha256_ssse3_final(desc, D);
212
213 memcpy(hash, D, SHA224_DIGEST_SIZE);
214 memzero_explicit(D, SHA256_DIGEST_SIZE);
215
216 return 0;
217} 98}
218 99
219static struct shash_alg algs[] = { { 100static struct shash_alg algs[] = { {
220 .digestsize = SHA256_DIGEST_SIZE, 101 .digestsize = SHA256_DIGEST_SIZE,
221 .init = sha256_ssse3_init, 102 .init = sha256_base_init,
222 .update = sha256_ssse3_update, 103 .update = sha256_ssse3_update,
223 .final = sha256_ssse3_final, 104 .final = sha256_ssse3_final,
224 .export = sha256_ssse3_export, 105 .finup = sha256_ssse3_finup,
225 .import = sha256_ssse3_import,
226 .descsize = sizeof(struct sha256_state), 106 .descsize = sizeof(struct sha256_state),
227 .statesize = sizeof(struct sha256_state),
228 .base = { 107 .base = {
229 .cra_name = "sha256", 108 .cra_name = "sha256",
230 .cra_driver_name = "sha256-ssse3", 109 .cra_driver_name = "sha256-ssse3",
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { {
235 } 114 }
236}, { 115}, {
237 .digestsize = SHA224_DIGEST_SIZE, 116 .digestsize = SHA224_DIGEST_SIZE,
238 .init = sha224_ssse3_init, 117 .init = sha224_base_init,
239 .update = sha256_ssse3_update, 118 .update = sha256_ssse3_update,
240 .final = sha224_ssse3_final, 119 .final = sha256_ssse3_final,
241 .export = sha256_ssse3_export, 120 .finup = sha256_ssse3_finup,
242 .import = sha256_ssse3_import,
243 .descsize = sizeof(struct sha256_state), 121 .descsize = sizeof(struct sha256_state),
244 .statesize = sizeof(struct sha256_state),
245 .base = { 122 .base = {
246 .cra_name = "sha224", 123 .cra_name = "sha224",
247 .cra_driver_name = "sha224-ssse3", 124 .cra_driver_name = "sha224-ssse3",
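
All three SHA glue rewrites cast the raw assembly routine to the base library's block-function type. That cast is only sound because the state array sits at offset 0 of the sha*_state structure, which is exactly what the BUILD_BUG_ON() lines in the hunks pin down at compile time. A minimal sketch of the idea for the SHA-256 case (demo_block_fn mirrors the sha256_block_fn typedef from <crypto/sha256_base.h>; it is redeclared here only for illustration):

#include <linux/bug.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
#include <crypto/sha.h>

/* The generic helpers hand the block function a struct sha256_state *. */
typedef void (demo_block_fn)(struct sha256_state *sst, const u8 *src,
			     int blocks);

/* The SIMD routine wants the bare state words (digest first, data second). */
asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
				       u64 rounds);

static inline demo_block_fn *demo_block_fn_cast(void)
{
	/*
	 * A struct sha256_state * and a pointer to its ->state[] are the same
	 * address only while 'state' stays the first member.  This turns a
	 * future layout change into a build failure instead of silently
	 * corrupted digests at run time.
	 */
	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);

	return (demo_block_fn *)sha256_transform_ssse3;
}

This is also why the assembly files above swap their argument registers: with the digest pointer first, the assembly calling convention lines up with the block-function type used by the base helpers.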
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 974dde9bc6cd..565274d6a641 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -54,9 +54,9 @@
54 54
55# Virtual Registers 55# Virtual Registers
56# ARG1 56# ARG1
57msg = %rdi 57digest = %rdi
58# ARG2 58# ARG2
59digest = %rsi 59msg = %rsi
60# ARG3 60# ARG3
61msglen = %rdx 61msglen = %rdx
62T1 = %rcx 62T1 = %rcx
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
271.endm 271.endm
272 272
273######################################################################## 273########################################################################
274# void sha512_transform_avx(const void* M, void* D, u64 L) 274# void sha512_transform_avx(void* D, const void* M, u64 L)
275# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 275# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
276# The size of the message pointed to by M must be an integer multiple of SHA512 276# The size of the message pointed to by M must be an integer multiple of SHA512
277# message blocks. 277# message blocks.
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 568b96105f5c..a4771dcd1fcf 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -70,9 +70,9 @@ XFER = YTMP0
70BYTE_FLIP_MASK = %ymm9 70BYTE_FLIP_MASK = %ymm9
71 71
72# 1st arg 72# 1st arg
73INP = %rdi 73CTX = %rdi
74# 2nd arg 74# 2nd arg
75CTX = %rsi 75INP = %rsi
76# 3rd arg 76# 3rd arg
77NUM_BLKS = %rdx 77NUM_BLKS = %rdx
78 78
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
562.endm 562.endm
563 563
564######################################################################## 564########################################################################
565# void sha512_transform_rorx(const void* M, void* D, uint64_t L)# 565# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
566# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 566# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
567# The size of the message pointed to by M must be an integer multiple of SHA512 567# The size of the message pointed to by M must be an integer multiple of SHA512
568# message blocks. 568# message blocks.
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index fb56855d51f5..e610e29cbc81 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -53,9 +53,9 @@
53 53
54# Virtual Registers 54# Virtual Registers
55# ARG1 55# ARG1
56msg = %rdi 56digest = %rdi
57# ARG2 57# ARG2
58digest = %rsi 58msg = %rsi
59# ARG3 59# ARG3
60msglen = %rdx 60msglen = %rdx
61T1 = %rcx 61T1 = %rcx
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
269.endm 269.endm
270 270
271######################################################################## 271########################################################################
272# void sha512_transform_ssse3(const void* M, void* D, u64 L)# 272# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
273# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 273# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
274# The size of the message pointed to by M must be an integer multiple of SHA512 274# The size of the message pointed to by M must be an integer multiple of SHA512
275# message blocks. 275# message blocks.
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b6af26832bf..d9fa4c1e063f 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,205 +34,75 @@
34#include <linux/cryptohash.h> 34#include <linux/cryptohash.h>
35#include <linux/types.h> 35#include <linux/types.h>
36#include <crypto/sha.h> 36#include <crypto/sha.h>
37#include <asm/byteorder.h> 37#include <crypto/sha512_base.h>
38#include <asm/i387.h> 38#include <asm/i387.h>
39#include <asm/xcr.h> 39#include <asm/xcr.h>
40#include <asm/xsave.h> 40#include <asm/xsave.h>
41 41
42#include <linux/string.h> 42#include <linux/string.h>
43 43
44asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, 44asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
45 u64 rounds); 45 u64 rounds);
46#ifdef CONFIG_AS_AVX 46#ifdef CONFIG_AS_AVX
47asmlinkage void sha512_transform_avx(const char *data, u64 *digest, 47asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
48 u64 rounds); 48 u64 rounds);
49#endif 49#endif
50#ifdef CONFIG_AS_AVX2 50#ifdef CONFIG_AS_AVX2
51asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, 51asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
52 u64 rounds); 52 u64 rounds);
53#endif 53#endif
54 54
55static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); 55static void (*sha512_transform_asm)(u64 *, const char *, u64);
56
57
58static int sha512_ssse3_init(struct shash_desc *desc)
59{
60 struct sha512_state *sctx = shash_desc_ctx(desc);
61
62 sctx->state[0] = SHA512_H0;
63 sctx->state[1] = SHA512_H1;
64 sctx->state[2] = SHA512_H2;
65 sctx->state[3] = SHA512_H3;
66 sctx->state[4] = SHA512_H4;
67 sctx->state[5] = SHA512_H5;
68 sctx->state[6] = SHA512_H6;
69 sctx->state[7] = SHA512_H7;
70 sctx->count[0] = sctx->count[1] = 0;
71
72 return 0;
73}
74 56
75static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 57static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
76 unsigned int len, unsigned int partial) 58 unsigned int len)
77{ 59{
78 struct sha512_state *sctx = shash_desc_ctx(desc); 60 struct sha512_state *sctx = shash_desc_ctx(desc);
79 unsigned int done = 0;
80
81 sctx->count[0] += len;
82 if (sctx->count[0] < len)
83 sctx->count[1]++;
84 61
85 if (partial) { 62 if (!irq_fpu_usable() ||
86 done = SHA512_BLOCK_SIZE - partial; 63 (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
87 memcpy(sctx->buf + partial, data, done); 64 return crypto_sha512_update(desc, data, len);
88 sha512_transform_asm(sctx->buf, sctx->state, 1);
89 }
90
91 if (len - done >= SHA512_BLOCK_SIZE) {
92 const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
93 65
94 sha512_transform_asm(data + done, sctx->state, (u64) rounds); 66 /* make sure casting to sha512_block_fn() is safe */
95 67 BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
96 done += rounds * SHA512_BLOCK_SIZE;
97 }
98 68
99 memcpy(sctx->buf, data + done, len - done); 69 kernel_fpu_begin();
70 sha512_base_do_update(desc, data, len,
71 (sha512_block_fn *)sha512_transform_asm);
72 kernel_fpu_end();
100 73
101 return 0; 74 return 0;
102} 75}
103 76
104static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 77static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
105 unsigned int len) 78 unsigned int len, u8 *out)
106{ 79{
107 struct sha512_state *sctx = shash_desc_ctx(desc); 80 if (!irq_fpu_usable())
108 unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; 81 return crypto_sha512_finup(desc, data, len, out);
109 int res;
110
111 /* Handle the fast case right here */
112 if (partial + len < SHA512_BLOCK_SIZE) {
113 sctx->count[0] += len;
114 if (sctx->count[0] < len)
115 sctx->count[1]++;
116 memcpy(sctx->buf + partial, data, len);
117
118 return 0;
119 }
120 82
121 if (!irq_fpu_usable()) { 83 kernel_fpu_begin();
122 res = crypto_sha512_update(desc, data, len); 84 if (len)
123 } else { 85 sha512_base_do_update(desc, data, len,
124 kernel_fpu_begin(); 86 (sha512_block_fn *)sha512_transform_asm);
125 res = __sha512_ssse3_update(desc, data, len, partial); 87 sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
126 kernel_fpu_end(); 88 kernel_fpu_end();
127 }
128 89
129 return res; 90 return sha512_base_finish(desc, out);
130} 91}
131 92
132
133/* Add padding and return the message digest. */ 93/* Add padding and return the message digest. */
134static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) 94static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
135{ 95{
136 struct sha512_state *sctx = shash_desc_ctx(desc); 96 return sha512_ssse3_finup(desc, NULL, 0, out);
137 unsigned int i, index, padlen;
138 __be64 *dst = (__be64 *)out;
139 __be64 bits[2];
140 static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
141
142 /* save number of bits */
143 bits[1] = cpu_to_be64(sctx->count[0] << 3);
144 bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
145
146 /* Pad out to 112 mod 128 and append length */
147 index = sctx->count[0] & 0x7f;
148 padlen = (index < 112) ? (112 - index) : ((128+112) - index);
149
150 if (!irq_fpu_usable()) {
151 crypto_sha512_update(desc, padding, padlen);
152 crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
153 } else {
154 kernel_fpu_begin();
155 /* We need to fill a whole block for __sha512_ssse3_update() */
156 if (padlen <= 112) {
157 sctx->count[0] += padlen;
158 if (sctx->count[0] < padlen)
159 sctx->count[1]++;
160 memcpy(sctx->buf + index, padding, padlen);
161 } else {
162 __sha512_ssse3_update(desc, padding, padlen, index);
163 }
164 __sha512_ssse3_update(desc, (const u8 *)&bits,
165 sizeof(bits), 112);
166 kernel_fpu_end();
167 }
168
169 /* Store state in digest */
170 for (i = 0; i < 8; i++)
171 dst[i] = cpu_to_be64(sctx->state[i]);
172
173 /* Wipe context */
174 memset(sctx, 0, sizeof(*sctx));
175
176 return 0;
177}
178
179static int sha512_ssse3_export(struct shash_desc *desc, void *out)
180{
181 struct sha512_state *sctx = shash_desc_ctx(desc);
182
183 memcpy(out, sctx, sizeof(*sctx));
184
185 return 0;
186}
187
188static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
189{
190 struct sha512_state *sctx = shash_desc_ctx(desc);
191
192 memcpy(sctx, in, sizeof(*sctx));
193
194 return 0;
195}
196
197static int sha384_ssse3_init(struct shash_desc *desc)
198{
199 struct sha512_state *sctx = shash_desc_ctx(desc);
200
201 sctx->state[0] = SHA384_H0;
202 sctx->state[1] = SHA384_H1;
203 sctx->state[2] = SHA384_H2;
204 sctx->state[3] = SHA384_H3;
205 sctx->state[4] = SHA384_H4;
206 sctx->state[5] = SHA384_H5;
207 sctx->state[6] = SHA384_H6;
208 sctx->state[7] = SHA384_H7;
209
210 sctx->count[0] = sctx->count[1] = 0;
211
212 return 0;
213}
214
215static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
216{
217 u8 D[SHA512_DIGEST_SIZE];
218
219 sha512_ssse3_final(desc, D);
220
221 memcpy(hash, D, SHA384_DIGEST_SIZE);
222 memzero_explicit(D, SHA512_DIGEST_SIZE);
223
224 return 0;
225} 97}
226 98
227static struct shash_alg algs[] = { { 99static struct shash_alg algs[] = { {
228 .digestsize = SHA512_DIGEST_SIZE, 100 .digestsize = SHA512_DIGEST_SIZE,
229 .init = sha512_ssse3_init, 101 .init = sha512_base_init,
230 .update = sha512_ssse3_update, 102 .update = sha512_ssse3_update,
231 .final = sha512_ssse3_final, 103 .final = sha512_ssse3_final,
232 .export = sha512_ssse3_export, 104 .finup = sha512_ssse3_finup,
233 .import = sha512_ssse3_import,
234 .descsize = sizeof(struct sha512_state), 105 .descsize = sizeof(struct sha512_state),
235 .statesize = sizeof(struct sha512_state),
236 .base = { 106 .base = {
237 .cra_name = "sha512", 107 .cra_name = "sha512",
238 .cra_driver_name = "sha512-ssse3", 108 .cra_driver_name = "sha512-ssse3",
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { {
243 } 113 }
244}, { 114}, {
245 .digestsize = SHA384_DIGEST_SIZE, 115 .digestsize = SHA384_DIGEST_SIZE,
246 .init = sha384_ssse3_init, 116 .init = sha384_base_init,
247 .update = sha512_ssse3_update, 117 .update = sha512_ssse3_update,
248 .final = sha384_ssse3_final, 118 .final = sha512_ssse3_final,
249 .export = sha512_ssse3_export, 119 .finup = sha512_ssse3_finup,
250 .import = sha512_ssse3_import,
251 .descsize = sizeof(struct sha512_state), 120 .descsize = sizeof(struct sha512_state),
252 .statesize = sizeof(struct sha512_state),
253 .base = { 121 .base = {
254 .cra_name = "sha384", 122 .cra_name = "sha384",
255 .cra_driver_name = "sha384-ssse3", 123 .cra_driver_name = "sha384-ssse3",
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
264 movq R1, 8(%rsi) 264 movq R1, 8(%rsi)
265 265
266 popq R1 266 popq R1
267 movq $1,%rax 267 movl $1,%eax
268 ret 268 ret
269ENDPROC(twofish_enc_blk) 269ENDPROC(twofish_enc_blk)
270 270
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
316 movq R1, 8(%rsi) 316 movq R1, 8(%rsi)
317 317
318 popq R1 318 popq R1
319 movq $1,%rax 319 movl $1,%eax
320 ret 320 ret
321ENDPROC(twofish_dec_blk) 321ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 1ac531ea9bcc..b5e2d5651851 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { {
340 .cra_name = "__ecb-twofish-avx", 340 .cra_name = "__ecb-twofish-avx",
341 .cra_driver_name = "__driver-ecb-twofish-avx", 341 .cra_driver_name = "__driver-ecb-twofish-avx",
342 .cra_priority = 0, 342 .cra_priority = 0,
343 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 343 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
344 CRYPTO_ALG_INTERNAL,
344 .cra_blocksize = TF_BLOCK_SIZE, 345 .cra_blocksize = TF_BLOCK_SIZE,
345 .cra_ctxsize = sizeof(struct twofish_ctx), 346 .cra_ctxsize = sizeof(struct twofish_ctx),
346 .cra_alignmask = 0, 347 .cra_alignmask = 0,
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { {
359 .cra_name = "__cbc-twofish-avx", 360 .cra_name = "__cbc-twofish-avx",
360 .cra_driver_name = "__driver-cbc-twofish-avx", 361 .cra_driver_name = "__driver-cbc-twofish-avx",
361 .cra_priority = 0, 362 .cra_priority = 0,
362 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 363 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
364 CRYPTO_ALG_INTERNAL,
363 .cra_blocksize = TF_BLOCK_SIZE, 365 .cra_blocksize = TF_BLOCK_SIZE,
364 .cra_ctxsize = sizeof(struct twofish_ctx), 366 .cra_ctxsize = sizeof(struct twofish_ctx),
365 .cra_alignmask = 0, 367 .cra_alignmask = 0,
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { {
378 .cra_name = "__ctr-twofish-avx", 380 .cra_name = "__ctr-twofish-avx",
379 .cra_driver_name = "__driver-ctr-twofish-avx", 381 .cra_driver_name = "__driver-ctr-twofish-avx",
380 .cra_priority = 0, 382 .cra_priority = 0,
381 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 383 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
384 CRYPTO_ALG_INTERNAL,
382 .cra_blocksize = 1, 385 .cra_blocksize = 1,
383 .cra_ctxsize = sizeof(struct twofish_ctx), 386 .cra_ctxsize = sizeof(struct twofish_ctx),
384 .cra_alignmask = 0, 387 .cra_alignmask = 0,
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { {
398 .cra_name = "__lrw-twofish-avx", 401 .cra_name = "__lrw-twofish-avx",
399 .cra_driver_name = "__driver-lrw-twofish-avx", 402 .cra_driver_name = "__driver-lrw-twofish-avx",
400 .cra_priority = 0, 403 .cra_priority = 0,
401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 404 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
405 CRYPTO_ALG_INTERNAL,
402 .cra_blocksize = TF_BLOCK_SIZE, 406 .cra_blocksize = TF_BLOCK_SIZE,
403 .cra_ctxsize = sizeof(struct twofish_lrw_ctx), 407 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
404 .cra_alignmask = 0, 408 .cra_alignmask = 0,
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { {
421 .cra_name = "__xts-twofish-avx", 425 .cra_name = "__xts-twofish-avx",
422 .cra_driver_name = "__driver-xts-twofish-avx", 426 .cra_driver_name = "__driver-xts-twofish-avx",
423 .cra_priority = 0, 427 .cra_priority = 0,
424 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 428 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
429 CRYPTO_ALG_INTERNAL,
425 .cra_blocksize = TF_BLOCK_SIZE, 430 .cra_blocksize = TF_BLOCK_SIZE,
426 .cra_ctxsize = sizeof(struct twofish_xts_ctx), 431 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
427 .cra_alignmask = 0, 432 .cra_alignmask = 0,
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
7 6
8obj-$(CONFIG_IA32_AOUT) += ia32_aout.o 7obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
9 8
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
161} 161}
162 162
163static int ia32_restore_sigcontext(struct pt_regs *regs, 163static int ia32_restore_sigcontext(struct pt_regs *regs,
164 struct sigcontext_ia32 __user *sc, 164 struct sigcontext_ia32 __user *sc)
165 unsigned int *pax)
166{ 165{
167 unsigned int tmpflags, err = 0; 166 unsigned int tmpflags, err = 0;
168 void __user *buf; 167 void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
184 RELOAD_SEG(es); 183 RELOAD_SEG(es);
185 184
186 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 185 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
187 COPY(dx); COPY(cx); COPY(ip); 186 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
188 /* Don't touch extended registers */ 187 /* Don't touch extended registers */
189 188
190 COPY_SEG_CPL3(cs); 189 COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
197 196
198 get_user_ex(tmp, &sc->fpstate); 197 get_user_ex(tmp, &sc->fpstate);
199 buf = compat_ptr(tmp); 198 buf = compat_ptr(tmp);
200
201 get_user_ex(*pax, &sc->ax);
202 } get_user_catch(err); 199 } get_user_catch(err);
203 200
204 err |= restore_xstate_sig(buf, 1); 201 err |= restore_xstate_sig(buf, 1);
205 202
203 force_iret();
204
206 return err; 205 return err;
207} 206}
208 207
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
211 struct pt_regs *regs = current_pt_regs(); 210 struct pt_regs *regs = current_pt_regs();
212 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); 211 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
213 sigset_t set; 212 sigset_t set;
214 unsigned int ax;
215 213
216 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 214 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
217 goto badframe; 215 goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
224 222
225 set_current_blocked(&set); 223 set_current_blocked(&set);
226 224
227 if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) 225 if (ia32_restore_sigcontext(regs, &frame->sc))
228 goto badframe; 226 goto badframe;
229 return ax; 227 return regs->ax;
230 228
231badframe: 229badframe:
232 signal_fault(regs, frame, "32bit sigreturn"); 230 signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
238 struct pt_regs *regs = current_pt_regs(); 236 struct pt_regs *regs = current_pt_regs();
239 struct rt_sigframe_ia32 __user *frame; 237 struct rt_sigframe_ia32 __user *frame;
240 sigset_t set; 238 sigset_t set;
241 unsigned int ax;
242 239
243 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); 240 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
244 241
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
249 246
250 set_current_blocked(&set); 247 set_current_blocked(&set);
251 248
252 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 249 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
253 goto badframe; 250 goto badframe;
254 251
255 if (compat_restore_altstack(&frame->uc.uc_stack)) 252 if (compat_restore_altstack(&frame->uc.uc_stack))
256 goto badframe; 253 goto badframe;
257 254
258 return ax; 255 return regs->ax;
259 256
260badframe: 257badframe:
261 signal_fault(regs, frame, "32bit rt sigreturn"); 258 signal_fault(regs, frame, "32bit rt sigreturn");
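
Both sigreturn paths above now follow the same shape: ia32_restore_sigcontext() writes ax into pt_regs together with the other registers and calls force_iret() so the return to user space takes the full iret path, and the syscall simply hands back regs->ax instead of threading the value through an out-parameter. A trimmed sketch of that flow as it would sit in this file (relying on the file's existing includes; the extra blocked-signal words and some error details are elided):

asmlinkage long demo_sys32_sigreturn(void)
{
	struct pt_regs *regs = current_pt_regs();
	struct sigframe_ia32 __user *frame =
		(struct sigframe_ia32 __user *)(regs->sp - 8);
	sigset_t set;

	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)) ||
	    __get_user(set.sig[0], &frame->sc.oldmask))
		goto badframe;

	set_current_blocked(&set);

	/* Restores di/si/bp/sp/bx/dx/cx/ip *and* ax into *regs, then force_iret(). */
	if (ia32_restore_sigcontext(regs, &frame->sc))
		goto badframe;

	return regs->ax;	/* value comes straight from the restored pt_regs */

badframe:
	signal_fault(regs, frame, "32bit sigreturn");
	return 0;
}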
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
30 30
31 .section .entry.text, "ax" 31 .section .entry.text, "ax"
32 32
33 .macro IA32_ARG_FIXUP noebp=0 33 /* clobbers %rax */
34 movl %edi,%r8d 34 .macro CLEAR_RREGS _r9=rax
35 .if \noebp
36 .else
37 movl %ebp,%r9d
38 .endif
39 xchg %ecx,%esi
40 movl %ebx,%edi
41 movl %edx,%edx /* zero extension */
42 .endm
43
44 /* clobbers %eax */
45 .macro CLEAR_RREGS offset=0, _r9=rax
46 xorl %eax,%eax 35 xorl %eax,%eax
47 movq %rax,\offset+R11(%rsp) 36 movq %rax,R11(%rsp)
48 movq %rax,\offset+R10(%rsp) 37 movq %rax,R10(%rsp)
49 movq %\_r9,\offset+R9(%rsp) 38 movq %\_r9,R9(%rsp)
50 movq %rax,\offset+R8(%rsp) 39 movq %rax,R8(%rsp)
51 .endm 40 .endm
52 41
53 /* 42 /*
@@ -60,14 +49,14 @@
60 * If it's -1 to make us punt the syscall, then (u32)-1 is still 49 * If it's -1 to make us punt the syscall, then (u32)-1 is still
61 * an appropriately invalid value. 50 * an appropriately invalid value.
62 */ 51 */
63 .macro LOAD_ARGS32 offset, _r9=0 52 .macro LOAD_ARGS32 _r9=0
64 .if \_r9 53 .if \_r9
65 movl \offset+16(%rsp),%r9d 54 movl R9(%rsp),%r9d
66 .endif 55 .endif
67 movl \offset+40(%rsp),%ecx 56 movl RCX(%rsp),%ecx
68 movl \offset+48(%rsp),%edx 57 movl RDX(%rsp),%edx
69 movl \offset+56(%rsp),%esi 58 movl RSI(%rsp),%esi
70 movl \offset+64(%rsp),%edi 59 movl RDI(%rsp),%edi
71 movl %eax,%eax /* zero extension */ 60 movl %eax,%eax /* zero extension */
72 .endm 61 .endm
73 62
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
99/* 88/*
100 * 32bit SYSENTER instruction entry. 89 * 32bit SYSENTER instruction entry.
101 * 90 *
91 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
92 * IF and VM in rflags are cleared (IOW: interrupts are off).
93 * SYSENTER does not save anything on the stack,
94 * and does not save old rip (!!!) and rflags.
95 *
102 * Arguments: 96 * Arguments:
103 * %eax System call number. 97 * eax system call number
104 * %ebx Arg1 98 * ebx arg1
105 * %ecx Arg2 99 * ecx arg2
106 * %edx Arg3 100 * edx arg3
107 * %esi Arg4 101 * esi arg4
108 * %edi Arg5 102 * edi arg5
109 * %ebp user stack 103 * ebp user stack
110 * 0(%ebp) Arg6 104 * 0(%ebp) arg6
111 * 105 *
112 * Interrupts off.
113 *
114 * This is purely a fast path. For anything complicated we use the int 0x80 106 * This is purely a fast path. For anything complicated we use the int 0x80
115 * path below. Set up a complete hardware stack frame to share code 107 * path below. We set up a complete hardware stack frame to share code
116 * with the int 0x80 path. 108 * with the int 0x80 path.
117 */ 109 */
118ENTRY(ia32_sysenter_target) 110ENTRY(ia32_sysenter_target)
119 CFI_STARTPROC32 simple 111 CFI_STARTPROC32 simple
120 CFI_SIGNAL_FRAME 112 CFI_SIGNAL_FRAME
121 CFI_DEF_CFA rsp,0 113 CFI_DEF_CFA rsp,0
122 CFI_REGISTER rsp,rbp 114 CFI_REGISTER rsp,rbp
123 SWAPGS_UNSAFE_STACK 115
124 movq PER_CPU_VAR(kernel_stack), %rsp
125 addq $(KERNEL_STACK_OFFSET),%rsp
126 /* 116 /*
127 * No need to follow this irqs on/off section: the syscall 117 * Interrupts are off on entry.
128 * disabled irqs, here we enable it straight after entry: 118 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
119 * it is too small to ever cause noticeable irq latency.
129 */ 120 */
121 SWAPGS_UNSAFE_STACK
122 movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
130 ENABLE_INTERRUPTS(CLBR_NONE) 123 ENABLE_INTERRUPTS(CLBR_NONE)
131 movl %ebp,%ebp /* zero extension */ 124
132 pushq_cfi $__USER32_DS 125 /* Zero-extending 32-bit regs, do not remove */
133 /*CFI_REL_OFFSET ss,0*/ 126 movl %ebp, %ebp
134 pushq_cfi %rbp
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi
137 /*CFI_REL_OFFSET rflags,0*/
138 movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
139 CFI_REGISTER rip,r10
140 pushq_cfi $__USER32_CS
141 /*CFI_REL_OFFSET cs,0*/
142 movl %eax, %eax 127 movl %eax, %eax
143 pushq_cfi %r10 128
144 CFI_REL_OFFSET rip,0 129 movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
145 pushq_cfi %rax 130 CFI_REGISTER rip,r10
131
132 /* Construct struct pt_regs on stack */
133 pushq_cfi $__USER32_DS /* pt_regs->ss */
134 pushq_cfi %rbp /* pt_regs->sp */
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi /* pt_regs->flags */
137 pushq_cfi $__USER32_CS /* pt_regs->cs */
138 pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */
139 CFI_REL_OFFSET rip,0
140 pushq_cfi_reg rax /* pt_regs->orig_ax */
141 pushq_cfi_reg rdi /* pt_regs->di */
142 pushq_cfi_reg rsi /* pt_regs->si */
143 pushq_cfi_reg rdx /* pt_regs->dx */
144 pushq_cfi_reg rcx /* pt_regs->cx */
145 pushq_cfi_reg rax /* pt_regs->ax */
146 cld 146 cld
147 SAVE_ARGS 0,1,0 147 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
148 /* no need to do an access_ok check here because rbp has been 148 CFI_ADJUST_CFA_OFFSET 10*8
149 32bit zero extended */ 149
150 /*
151 * no need to do an access_ok check here because rbp has been
152 * 32bit zero extended
153 */
150 ASM_STAC 154 ASM_STAC
1511: movl (%rbp),%ebp 1551: movl (%rbp),%ebp
152 _ASM_EXTABLE(1b,ia32_badarg) 156 _ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
157 * ourselves. To save a few cycles, we can check whether 161 * ourselves. To save a few cycles, we can check whether
158 * NT was set instead of doing an unconditional popfq. 162 * NT was set instead of doing an unconditional popfq.
159 */ 163 */
160 testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) 164 testl $X86_EFLAGS_NT,EFLAGS(%rsp)
161 jnz sysenter_fix_flags 165 jnz sysenter_fix_flags
162sysenter_flags_fixed: 166sysenter_flags_fixed:
163 167
164 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 168 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
165 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 169 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
166 CFI_REMEMBER_STATE 170 CFI_REMEMBER_STATE
167 jnz sysenter_tracesys 171 jnz sysenter_tracesys
168 cmpq $(IA32_NR_syscalls-1),%rax 172 cmpq $(IA32_NR_syscalls-1),%rax
169 ja ia32_badsys 173 ja ia32_badsys
170sysenter_do_call: 174sysenter_do_call:
171 IA32_ARG_FIXUP 175 /* 32bit syscall -> 64bit C ABI argument conversion */
176 movl %edi,%r8d /* arg5 */
177 movl %ebp,%r9d /* arg6 */
178 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
179 movl %ebx,%edi /* arg1 */
180 movl %edx,%edx /* arg3 (zero extension) */
172sysenter_dispatch: 181sysenter_dispatch:
173 call *ia32_sys_call_table(,%rax,8) 182 call *ia32_sys_call_table(,%rax,8)
174 movq %rax,RAX-ARGOFFSET(%rsp) 183 movq %rax,RAX(%rsp)
175 DISABLE_INTERRUPTS(CLBR_NONE) 184 DISABLE_INTERRUPTS(CLBR_NONE)
176 TRACE_IRQS_OFF 185 TRACE_IRQS_OFF
177 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 186 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
178 jnz sysexit_audit 187 jnz sysexit_audit
179sysexit_from_sys_call: 188sysexit_from_sys_call:
180 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 189 /*
181 /* clear IF, that popfq doesn't enable interrupts early */ 190 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
182 andl $~0x200,EFLAGS-ARGOFFSET(%rsp) 191 * NMI between STI and SYSEXIT has poorly specified behavior,
183 andl $~0x200,EFLAGS-ARGOFFSET(%rsp) 192 * and an NMI followed by an IRQ with usergs is fatal. So
184 CFI_REGISTER rip,rdx 193 * we just pretend we're using SYSEXIT but we really use
185 RESTORE_ARGS 0,24,0,0,0,0 194 * SYSRETL instead.
195 *
196 * This code path is still called 'sysexit' because it pairs
197 * with 'sysenter' and it uses the SYSENTER calling convention.
198 */
199 andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
200 movl RIP(%rsp),%ecx /* User %eip */
201 CFI_REGISTER rip,rcx
202 RESTORE_RSI_RDI
203 xorl %edx,%edx /* avoid info leaks */
186 xorq %r8,%r8 204 xorq %r8,%r8
187 xorq %r9,%r9 205 xorq %r9,%r9
188 xorq %r10,%r10 206 xorq %r10,%r10
189 xorq %r11,%r11 207 movl EFLAGS(%rsp),%r11d /* User eflags */
190 popfq_cfi
191 /*CFI_RESTORE rflags*/ 208 /*CFI_RESTORE rflags*/
192 popq_cfi %rcx /* User %esp */
193 CFI_REGISTER rsp,rcx
194 TRACE_IRQS_ON 209 TRACE_IRQS_ON
195 ENABLE_INTERRUPTS_SYSEXIT32 210
211 /*
212 * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
213 * since it avoids a dicey window with interrupts enabled.
214 */
215 movl RSP(%rsp),%esp
216
217 /*
218 * USERGS_SYSRET32 does:
219 * gsbase = user's gs base
220 * eip = ecx
221 * rflags = r11
222 * cs = __USER32_CS
223 * ss = __USER_DS
224 *
225 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
226 *
227 * pop %ebp
228 * pop %edx
229 * pop %ecx
230 *
231 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
232 * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
233 * address (already known to user code), and R12-R15 are
234 * callee-saved and therefore don't contain any interesting
235 * kernel data.
236 */
237 USERGS_SYSRET32
196 238
197 CFI_RESTORE_STATE 239 CFI_RESTORE_STATE
198 240
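
The old IA32_ARG_FIXUP macro hid the 32-bit to 64-bit argument shuffle; the new code spells it out inline at each call site. A conceptual C sketch of the same mapping (hypothetical helper, not the real dispatcher):

#include <linux/types.h>

/*
 * Sketch: the zero-extended i386 registers become the six x86-64 C ABI
 * arguments.
 *
 *   arg1: ebx -> rdi      arg4: esi -> rcx  (via the xchg)
 *   arg2: ecx -> rsi      arg5: edi -> r8
 *   arg3: edx -> rdx      arg6: ebp -> r9
 */
typedef long (*compat_syscall_fn)(unsigned long, unsigned long, unsigned long,
				  unsigned long, unsigned long, unsigned long);

static long dispatch_compat_sketch(compat_syscall_fn fn, u32 ebx, u32 ecx,
				   u32 edx, u32 esi, u32 edi, u32 ebp)
{
	/* In C the ABI placement is implicit; the asm above does it by hand. */
	return fn(ebx, ecx, edx, esi, edi, ebp);
}
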
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
205 movl %ebx,%esi /* 2nd arg: 1st syscall arg */ 247 movl %ebx,%esi /* 2nd arg: 1st syscall arg */
206 movl %eax,%edi /* 1st arg: syscall number */ 248 movl %eax,%edi /* 1st arg: syscall number */
207 call __audit_syscall_entry 249 call __audit_syscall_entry
208 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ 250 movl RAX(%rsp),%eax /* reload syscall number */
209 cmpq $(IA32_NR_syscalls-1),%rax 251 cmpq $(IA32_NR_syscalls-1),%rax
210 ja ia32_badsys 252 ja ia32_badsys
211 movl %ebx,%edi /* reload 1st syscall arg */ 253 movl %ebx,%edi /* reload 1st syscall arg */
212 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ 254 movl RCX(%rsp),%esi /* reload 2nd syscall arg */
213 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ 255 movl RDX(%rsp),%edx /* reload 3rd syscall arg */
214 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ 256 movl RSI(%rsp),%ecx /* reload 4th syscall arg */
215 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ 257 movl RDI(%rsp),%r8d /* reload 5th syscall arg */
216 .endm 258 .endm
217 259
218 .macro auditsys_exit exit 260 .macro auditsys_exit exit
219 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 261 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
220 jnz ia32_ret_from_sys_call 262 jnz ia32_ret_from_sys_call
221 TRACE_IRQS_ON 263 TRACE_IRQS_ON
222 ENABLE_INTERRUPTS(CLBR_NONE) 264 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
2271: setbe %al /* 1 if error, 0 if not */ 2691: setbe %al /* 1 if error, 0 if not */
228 movzbl %al,%edi /* zero-extend that into %edi */ 270 movzbl %al,%edi /* zero-extend that into %edi */
229 call __audit_syscall_exit 271 call __audit_syscall_exit
230 movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ 272 movq RAX(%rsp),%rax /* reload syscall return value */
231 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 273 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
232 DISABLE_INTERRUPTS(CLBR_NONE) 274 DISABLE_INTERRUPTS(CLBR_NONE)
233 TRACE_IRQS_OFF 275 TRACE_IRQS_OFF
234 testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 276 testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
235 jz \exit 277 jz \exit
236 CLEAR_RREGS -ARGOFFSET 278 CLEAR_RREGS
237 jmp int_with_check 279 jmp int_with_check
238 .endm 280 .endm
239 281
@@ -253,16 +295,16 @@ sysenter_fix_flags:
253 295
254sysenter_tracesys: 296sysenter_tracesys:
255#ifdef CONFIG_AUDITSYSCALL 297#ifdef CONFIG_AUDITSYSCALL
256 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 298 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
257 jz sysenter_auditsys 299 jz sysenter_auditsys
258#endif 300#endif
259 SAVE_REST 301 SAVE_EXTRA_REGS
260 CLEAR_RREGS 302 CLEAR_RREGS
261 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 303 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
262 movq %rsp,%rdi /* &pt_regs -> arg1 */ 304 movq %rsp,%rdi /* &pt_regs -> arg1 */
263 call syscall_trace_enter 305 call syscall_trace_enter
264 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 306 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
265 RESTORE_REST 307 RESTORE_EXTRA_REGS
266 cmpq $(IA32_NR_syscalls-1),%rax 308 cmpq $(IA32_NR_syscalls-1),%rax
267 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 309 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
268 jmp sysenter_do_call 310 jmp sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
272/* 314/*
273 * 32bit SYSCALL instruction entry. 315 * 32bit SYSCALL instruction entry.
274 * 316 *
317 * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
318 * then loads new ss, cs, and rip from previously programmed MSRs.
319 * rflags gets masked by a value from another MSR (so CLD and CLAC
320 * are not needed). SYSCALL does not save anything on the stack
321 * and does not change rsp.
322 *
323 * Note: rflags saving+masking-with-MSR happens only in Long mode
324 * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
325 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
326 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
327 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
328 *
275 * Arguments: 329 * Arguments:
276 * %eax System call number. 330 * eax system call number
277 * %ebx Arg1 331 * ecx return address
278 * %ecx return EIP 332 * ebx arg1
279 * %edx Arg3 333 * ebp arg2 (note: not saved in the stack frame, should not be touched)
280 * %esi Arg4 334 * edx arg3
281 * %edi Arg5 335 * esi arg4
282 * %ebp Arg2 [note: not saved in the stack frame, should not be touched] 336 * edi arg5
283 * %esp user stack 337 * esp user stack
284 * 0(%esp) Arg6 338 * 0(%esp) arg6
285 * 339 *
286 * Interrupts off.
287 *
288 * This is purely a fast path. For anything complicated we use the int 0x80 340 * This is purely a fast path. For anything complicated we use the int 0x80
289 * path below. Set up a complete hardware stack frame to share code 341 * path below. We set up a complete hardware stack frame to share code
290 * with the int 0x80 path. 342 * with the int 0x80 path.
291 */ 343 */
292ENTRY(ia32_cstar_target) 344ENTRY(ia32_cstar_target)
293 CFI_STARTPROC32 simple 345 CFI_STARTPROC32 simple
294 CFI_SIGNAL_FRAME 346 CFI_SIGNAL_FRAME
295 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 347 CFI_DEF_CFA rsp,0
296 CFI_REGISTER rip,rcx 348 CFI_REGISTER rip,rcx
297 /*CFI_REGISTER rflags,r11*/ 349 /*CFI_REGISTER rflags,r11*/
350
351 /*
352 * Interrupts are off on entry.
353 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
354 * it is too small to ever cause noticeable irq latency.
355 */
298 SWAPGS_UNSAFE_STACK 356 SWAPGS_UNSAFE_STACK
299 movl %esp,%r8d 357 movl %esp,%r8d
300 CFI_REGISTER rsp,r8 358 CFI_REGISTER rsp,r8
301 movq PER_CPU_VAR(kernel_stack),%rsp 359 movq PER_CPU_VAR(kernel_stack),%rsp
302 /*
303 * No need to follow this irqs on/off section: the syscall
304 * disabled irqs and here we enable it straight after entry:
305 */
306 ENABLE_INTERRUPTS(CLBR_NONE) 360 ENABLE_INTERRUPTS(CLBR_NONE)
307 SAVE_ARGS 8,0,0 361
308 movl %eax,%eax /* zero extension */ 362 /* Zero-extending 32-bit regs, do not remove */
309 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 363 movl %eax,%eax
310 movq %rcx,RIP-ARGOFFSET(%rsp) 364
311 CFI_REL_OFFSET rip,RIP-ARGOFFSET 365 /* Construct struct pt_regs on stack */
312 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ 366 pushq_cfi $__USER32_DS /* pt_regs->ss */
367 pushq_cfi %r8 /* pt_regs->sp */
368 CFI_REL_OFFSET rsp,0
369 pushq_cfi %r11 /* pt_regs->flags */
370 pushq_cfi $__USER32_CS /* pt_regs->cs */
371 pushq_cfi %rcx /* pt_regs->ip */
372 CFI_REL_OFFSET rip,0
373 pushq_cfi_reg rax /* pt_regs->orig_ax */
374 pushq_cfi_reg rdi /* pt_regs->di */
375 pushq_cfi_reg rsi /* pt_regs->si */
376 pushq_cfi_reg rdx /* pt_regs->dx */
377 pushq_cfi_reg rbp /* pt_regs->cx */
313 movl %ebp,%ecx 378 movl %ebp,%ecx
314 movq $__USER32_CS,CS-ARGOFFSET(%rsp) 379 pushq_cfi_reg rax /* pt_regs->ax */
315 movq $__USER32_DS,SS-ARGOFFSET(%rsp) 380 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
316 movq %r11,EFLAGS-ARGOFFSET(%rsp) 381 CFI_ADJUST_CFA_OFFSET 10*8
317 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 382
318 movq %r8,RSP-ARGOFFSET(%rsp) 383 /*
319 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 384 * no need to do an access_ok check here because r8 has been
320 /* no need to do an access_ok check here because r8 has been 385 * 32bit zero extended
321 32bit zero extended */ 386 */
322 /* hardware stack frame is complete now */
323 ASM_STAC 387 ASM_STAC
3241: movl (%r8),%r9d 3881: movl (%r8),%r9d
325 _ASM_EXTABLE(1b,ia32_badarg) 389 _ASM_EXTABLE(1b,ia32_badarg)
326 ASM_CLAC 390 ASM_CLAC
327 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 391 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
328 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 392 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
329 CFI_REMEMBER_STATE 393 CFI_REMEMBER_STATE
330 jnz cstar_tracesys 394 jnz cstar_tracesys
331 cmpq $IA32_NR_syscalls-1,%rax 395 cmpq $IA32_NR_syscalls-1,%rax
332 ja ia32_badsys 396 ja ia32_badsys
333cstar_do_call: 397cstar_do_call:
334 IA32_ARG_FIXUP 1 398 /* 32bit syscall -> 64bit C ABI argument conversion */
399 movl %edi,%r8d /* arg5 */
400 /* r9 already loaded */ /* arg6 */
401 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
402 movl %ebx,%edi /* arg1 */
403 movl %edx,%edx /* arg3 (zero extension) */
335cstar_dispatch: 404cstar_dispatch:
336 call *ia32_sys_call_table(,%rax,8) 405 call *ia32_sys_call_table(,%rax,8)
337 movq %rax,RAX-ARGOFFSET(%rsp) 406 movq %rax,RAX(%rsp)
338 DISABLE_INTERRUPTS(CLBR_NONE) 407 DISABLE_INTERRUPTS(CLBR_NONE)
339 TRACE_IRQS_OFF 408 TRACE_IRQS_OFF
340 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 409 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
341 jnz sysretl_audit 410 jnz sysretl_audit
342sysretl_from_sys_call: 411sysretl_from_sys_call:
343 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 412 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
344 RESTORE_ARGS 0,-ARG_SKIP,0,0,0 413 RESTORE_RSI_RDI_RDX
345 movl RIP-ARGOFFSET(%rsp),%ecx 414 movl RIP(%rsp),%ecx
346 CFI_REGISTER rip,rcx 415 CFI_REGISTER rip,rcx
347 movl EFLAGS-ARGOFFSET(%rsp),%r11d 416 movl EFLAGS(%rsp),%r11d
348 /*CFI_REGISTER rflags,r11*/ 417 /*CFI_REGISTER rflags,r11*/
349 xorq %r10,%r10 418 xorq %r10,%r10
350 xorq %r9,%r9 419 xorq %r9,%r9
351 xorq %r8,%r8 420 xorq %r8,%r8
352 TRACE_IRQS_ON 421 TRACE_IRQS_ON
353 movl RSP-ARGOFFSET(%rsp),%esp 422 movl RSP(%rsp),%esp
354 CFI_RESTORE rsp 423 CFI_RESTORE rsp
424 /*
425 * 64bit->32bit SYSRET restores eip from ecx,
426 * eflags from r11 (but RF and VM bits are forced to 0),
427 * cs and ss are loaded from MSRs.
428 * (Note: 32bit->32bit SYSRET is different: since r11
429 * does not exist, it merely sets eflags.IF=1).
430 */
355 USERGS_SYSRET32 431 USERGS_SYSRET32
356 432
357#ifdef CONFIG_AUDITSYSCALL 433#ifdef CONFIG_AUDITSYSCALL
358cstar_auditsys: 434cstar_auditsys:
359 CFI_RESTORE_STATE 435 CFI_RESTORE_STATE
360 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ 436 movl %r9d,R9(%rsp) /* register to be clobbered by call */
361 auditsys_entry_common 437 auditsys_entry_common
362 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ 438 movl R9(%rsp),%r9d /* reload 6th syscall arg */
363 jmp cstar_dispatch 439 jmp cstar_dispatch
364 440
365sysretl_audit: 441sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
368 444
369cstar_tracesys: 445cstar_tracesys:
370#ifdef CONFIG_AUDITSYSCALL 446#ifdef CONFIG_AUDITSYSCALL
371 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 447 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
372 jz cstar_auditsys 448 jz cstar_auditsys
373#endif 449#endif
374 xchgl %r9d,%ebp 450 xchgl %r9d,%ebp
375 SAVE_REST 451 SAVE_EXTRA_REGS
376 CLEAR_RREGS 0, r9 452 CLEAR_RREGS r9
377 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 453 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
378 movq %rsp,%rdi /* &pt_regs -> arg1 */ 454 movq %rsp,%rdi /* &pt_regs -> arg1 */
379 call syscall_trace_enter 455 call syscall_trace_enter
380 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ 456 LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
381 RESTORE_REST 457 RESTORE_EXTRA_REGS
382 xchgl %ebp,%r9d 458 xchgl %ebp,%r9d
383 cmpq $(IA32_NR_syscalls-1),%rax 459 cmpq $(IA32_NR_syscalls-1),%rax
384 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 460 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
391 jmp ia32_sysret 467 jmp ia32_sysret
392 CFI_ENDPROC 468 CFI_ENDPROC
393 469
394/* 470/*
395 * Emulated IA32 system calls via int 0x80. 471 * Emulated IA32 system calls via int 0x80.
396 * 472 *
397 * Arguments: 473 * Arguments:
398 * %eax System call number. 474 * eax system call number
399 * %ebx Arg1 475 * ebx arg1
400 * %ecx Arg2 476 * ecx arg2
401 * %edx Arg3 477 * edx arg3
402 * %esi Arg4 478 * esi arg4
403 * %edi Arg5 479 * edi arg5
404 * %ebp Arg6 [note: not saved in the stack frame, should not be touched] 480 * ebp arg6 (note: not saved in the stack frame, should not be touched)
405 * 481 *
406 * Notes: 482 * Notes:
407 * Uses the same stack frame as the x86-64 version. 483 * Uses the same stack frame as the x86-64 version.
408 * All registers except %eax must be saved (but ptrace may violate that) 484 * All registers except eax must be saved (but ptrace may violate that).
409 * Arguments are zero extended. For system calls that want sign extension and 485 * Arguments are zero extended. For system calls that want sign extension and
410 * take long arguments a wrapper is needed. Most calls can just be called 486 * take long arguments a wrapper is needed. Most calls can just be called
411 * directly. 487 * directly.
412 * Assumes it is only called from user space and entered with interrupts off. 488 * Assumes it is only called from user space and entered with interrupts off.
413 */ 489 */
414 490
415ENTRY(ia32_syscall) 491ENTRY(ia32_syscall)
416 CFI_STARTPROC32 simple 492 CFI_STARTPROC32 simple
417 CFI_SIGNAL_FRAME 493 CFI_SIGNAL_FRAME
418 CFI_DEF_CFA rsp,SS+8-RIP 494 CFI_DEF_CFA rsp,5*8
419 /*CFI_REL_OFFSET ss,SS-RIP*/ 495 /*CFI_REL_OFFSET ss,4*8 */
420 CFI_REL_OFFSET rsp,RSP-RIP 496 CFI_REL_OFFSET rsp,3*8
421 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 497 /*CFI_REL_OFFSET rflags,2*8 */
422 /*CFI_REL_OFFSET cs,CS-RIP*/ 498 /*CFI_REL_OFFSET cs,1*8 */
423 CFI_REL_OFFSET rip,RIP-RIP 499 CFI_REL_OFFSET rip,0*8
424 PARAVIRT_ADJUST_EXCEPTION_FRAME 500
425 SWAPGS
426 /* 501 /*
427 * No need to follow this irqs on/off section: the syscall 502 * Interrupts are off on entry.
428 * disabled irqs and here we enable it straight after entry: 503 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
504 * it is too small to ever cause noticeable irq latency.
429 */ 505 */
506 PARAVIRT_ADJUST_EXCEPTION_FRAME
507 SWAPGS
430 ENABLE_INTERRUPTS(CLBR_NONE) 508 ENABLE_INTERRUPTS(CLBR_NONE)
431 movl %eax,%eax 509
432 pushq_cfi %rax 510 /* Zero-extending 32-bit regs, do not remove */
511 movl %eax,%eax
512
513 /* Construct struct pt_regs on stack (iret frame is already on stack) */
514 pushq_cfi_reg rax /* pt_regs->orig_ax */
515 pushq_cfi_reg rdi /* pt_regs->di */
516 pushq_cfi_reg rsi /* pt_regs->si */
517 pushq_cfi_reg rdx /* pt_regs->dx */
518 pushq_cfi_reg rcx /* pt_regs->cx */
519 pushq_cfi_reg rax /* pt_regs->ax */
433 cld 520 cld
434 /* note the registers are not zero extended to the sf. 521 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
435 this could be a problem. */ 522 CFI_ADJUST_CFA_OFFSET 10*8
436 SAVE_ARGS 0,1,0 523
437 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 524 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
438 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 525 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
439 jnz ia32_tracesys 526 jnz ia32_tracesys
440 cmpq $(IA32_NR_syscalls-1),%rax 527 cmpq $(IA32_NR_syscalls-1),%rax
441 ja ia32_badsys 528 ja ia32_badsys
442ia32_do_call: 529ia32_do_call:
443 IA32_ARG_FIXUP 530 /* 32bit syscall -> 64bit C ABI argument conversion */
531 movl %edi,%r8d /* arg5 */
532 movl %ebp,%r9d /* arg6 */
533 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
534 movl %ebx,%edi /* arg1 */
535 movl %edx,%edx /* arg3 (zero extension) */
444 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 536 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
445ia32_sysret: 537ia32_sysret:
446 movq %rax,RAX-ARGOFFSET(%rsp) 538 movq %rax,RAX(%rsp)
447ia32_ret_from_sys_call: 539ia32_ret_from_sys_call:
448 CLEAR_RREGS -ARGOFFSET 540 CLEAR_RREGS
449 jmp int_ret_from_sys_call 541 jmp int_ret_from_sys_call
450 542
451ia32_tracesys: 543ia32_tracesys:
452 SAVE_REST 544 SAVE_EXTRA_REGS
453 CLEAR_RREGS 545 CLEAR_RREGS
454 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 546 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
455 movq %rsp,%rdi /* &pt_regs -> arg1 */ 547 movq %rsp,%rdi /* &pt_regs -> arg1 */
456 call syscall_trace_enter 548 call syscall_trace_enter
457 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 549 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
458 RESTORE_REST 550 RESTORE_EXTRA_REGS
459 cmpq $(IA32_NR_syscalls-1),%rax 551 cmpq $(IA32_NR_syscalls-1),%rax
460 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ 552 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
461 jmp ia32_do_call 553 jmp ia32_do_call
462END(ia32_syscall) 554END(ia32_syscall)
463 555
464ia32_badsys: 556ia32_badsys:
465 movq $0,ORIG_RAX-ARGOFFSET(%rsp) 557 movq $0,ORIG_RAX(%rsp)
466 movq $-ENOSYS,%rax 558 movq $-ENOSYS,%rax
467 jmp ia32_sysret 559 jmp ia32_sysret
468 560
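
As the notes above say, arguments arrive zero extended, so syscalls that take 64-bit or sign-extended values need a small wrapper. A hedged sketch of that pattern, modeled on the sys32_readahead() prototype visible further down in this diff (the body here is illustrative, not necessarily the exact tree contents):

#include <linux/syscalls.h>

/*
 * Sketch: 32-bit userspace passes a 64-bit file offset as two zero-extended
 * 32-bit halves; the compat wrapper glues them back together before calling
 * the native implementation.
 */
asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
				   size_t count)
{
	return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
}
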
@@ -479,8 +571,6 @@ GLOBAL(\label)
479 571
480 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn 572 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
481 PTREGSCALL stub32_sigreturn, sys32_sigreturn 573 PTREGSCALL stub32_sigreturn, sys32_sigreturn
482 PTREGSCALL stub32_execve, compat_sys_execve
483 PTREGSCALL stub32_execveat, compat_sys_execveat
484 PTREGSCALL stub32_fork, sys_fork 574 PTREGSCALL stub32_fork, sys_fork
485 PTREGSCALL stub32_vfork, sys_vfork 575 PTREGSCALL stub32_vfork, sys_vfork
486 576
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
492 582
493 ALIGN 583 ALIGN
494ia32_ptregs_common: 584ia32_ptregs_common:
495 popq %r11
496 CFI_ENDPROC 585 CFI_ENDPROC
497 CFI_STARTPROC32 simple 586 CFI_STARTPROC32 simple
498 CFI_SIGNAL_FRAME 587 CFI_SIGNAL_FRAME
499 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 588 CFI_DEF_CFA rsp,SIZEOF_PTREGS
500 CFI_REL_OFFSET rax,RAX-ARGOFFSET 589 CFI_REL_OFFSET rax,RAX
501 CFI_REL_OFFSET rcx,RCX-ARGOFFSET 590 CFI_REL_OFFSET rcx,RCX
502 CFI_REL_OFFSET rdx,RDX-ARGOFFSET 591 CFI_REL_OFFSET rdx,RDX
503 CFI_REL_OFFSET rsi,RSI-ARGOFFSET 592 CFI_REL_OFFSET rsi,RSI
504 CFI_REL_OFFSET rdi,RDI-ARGOFFSET 593 CFI_REL_OFFSET rdi,RDI
505 CFI_REL_OFFSET rip,RIP-ARGOFFSET 594 CFI_REL_OFFSET rip,RIP
506/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ 595/* CFI_REL_OFFSET cs,CS*/
507/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 596/* CFI_REL_OFFSET rflags,EFLAGS*/
508 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 597 CFI_REL_OFFSET rsp,RSP
509/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ 598/* CFI_REL_OFFSET ss,SS*/
510 SAVE_REST 599 SAVE_EXTRA_REGS 8
511 call *%rax 600 call *%rax
512 RESTORE_REST 601 RESTORE_EXTRA_REGS 8
513 jmp ia32_sysret /* misbalances the return cache */ 602 ret
514 CFI_ENDPROC 603 CFI_ENDPROC
515END(ia32_ptregs_common) 604END(ia32_ptregs_common)
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3
4long compat_ni_syscall(void)
5{
6 return -ENOSYS;
7}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
201 advice); 201 advice);
202} 202}
203 203
204long sys32_vm86_warning(void)
205{
206 struct task_struct *me = current;
207 static char lastcomm[sizeof(me->comm)];
208
209 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
210 compat_printk(KERN_INFO
211 "%s: vm86 mode not supported on 64 bit kernel\n",
212 me->comm);
213 strncpy(lastcomm, me->comm, sizeof(lastcomm));
214 }
215 return -ENOSYS;
216}
217
218asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, 204asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
219 size_t count) 205 size_t count)
220{ 206{
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
1/* System call table for ia32 emulation. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
13
14typedef void (*sys_call_ptr_t)(void);
15
16extern void compat_ni_syscall(void);
17
18const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
24#include <asm/syscalls_32.h>
25};
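
The deleted syscall_ia32.c above builds the table with a GCC range designated initializer: every slot first points at compat_ni_syscall, then the generated entries override the known syscall numbers. A minimal self-contained demonstration of that initializer pattern (hypothetical names, plain user-space C):

#include <stdio.h>

static long ni_syscall(void) { return -38; /* stands in for -ENOSYS */ }
static long do_getpid(void)  { return 42; }

typedef long (*call_ptr_t)(void);

/* Every slot defaults to ni_syscall; later designators override selected ones
 * (GCC warns with -Woverride-init, but this is exactly the table's trick). */
static const call_ptr_t table[8] = {
	[0 ... 7] = ni_syscall,		/* GNU range designator, as in the deleted file */
	[3]       = do_getpid,
};

int main(void)
{
	printf("%ld %ld\n", table[3](), table[5]());	/* prints: 42 -38 */
	return 0;
}
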
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,63 @@
18 .endm 18 .endm
19#endif 19#endif
20 20
21.macro altinstruction_entry orig alt feature orig_len alt_len 21.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
22 .long \orig - . 22 .long \orig - .
23 .long \alt - . 23 .long \alt - .
24 .word \feature 24 .word \feature
25 .byte \orig_len 25 .byte \orig_len
26 .byte \alt_len 26 .byte \alt_len
27 .byte \pad_len
28.endm
29
30.macro ALTERNATIVE oldinstr, newinstr, feature
31140:
32 \oldinstr
33141:
34 .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
35142:
36
37 .pushsection .altinstructions,"a"
38 altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
39 .popsection
40
41 .pushsection .altinstr_replacement,"ax"
42143:
43 \newinstr
44144:
45 .popsection
46.endm
47
48#define old_len 141b-140b
49#define new_len1 144f-143f
50#define new_len2 145f-144f
51
52/*
53 * max without conditionals. Idea adapted from:
54 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
55 */
56#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
57
58.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
59140:
60 \oldinstr
61141:
62 .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
63 (alt_max_short(new_len1, new_len2) - (old_len)),0x90
64142:
65
66 .pushsection .altinstructions,"a"
67 altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
68 altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
69 .popsection
70
71 .pushsection .altinstr_replacement,"ax"
72143:
73 \newinstr1
74144:
75 \newinstr2
76145:
77 .popsection
27.endm 78.endm
28 79
29#endif /* __ASSEMBLY__ */ 80#endif /* __ASSEMBLY__ */
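
The .skip expressions above compute, at assembly time, how many NOPs to pad the original instruction with: zero if the replacement is not longer, otherwise the difference. ALTERNATIVE_2 pads to the longer of the two replacements using the branchless alt_max_short trick. Note that gas evaluates a true relation as -1, hence the extra leading minus signs; a small runnable C check of both formulas (sketch, user-space, true relations are 1 in C):

#include <assert.h>

/* Branchless max, the C-relational form of alt_max_short() above. */
static int alt_max_short(int a, int b)
{
	return a ^ ((a ^ b) & -(a < b));
}

/* NOP padding added after the original instruction(s). */
static int pad_len(int old_len, int repl_len)
{
	int d = repl_len - old_len;
	return (d > 0) * d;		/* 0 if the replacement is not longer */
}

int main(void)
{
	assert(alt_max_short(3, 5) == 5);
	assert(alt_max_short(7, 2) == 7);
	assert(pad_len(2, 5) == 3);	/* pad with 3 NOPs */
	assert(pad_len(5, 2) == 0);	/* never "shrink" the original */
	return 0;
}
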
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..ba32af062f61 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
48 s32 repl_offset; /* offset to replacement instruction */ 48 s32 repl_offset; /* offset to replacement instruction */
49 u16 cpuid; /* cpuid bit set for replacement */ 49 u16 cpuid; /* cpuid bit set for replacement */
50 u8 instrlen; /* length of original instruction */ 50 u8 instrlen; /* length of original instruction */
51 u8 replacementlen; /* length of new instruction, <= instrlen */ 51 u8 replacementlen; /* length of new instruction */
52}; 52 u8 padlen; /* length of build-time padding */
53} __packed;
53 54
54extern void alternative_instructions(void); 55extern void alternative_instructions(void);
55extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); 56extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end)
76} 77}
77#endif /* CONFIG_SMP */ 78#endif /* CONFIG_SMP */
78 79
79#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" 80#define b_replacement(num) "664"#num
81#define e_replacement(num) "665"#num
80 82
81#define b_replacement(number) "663"#number 83#define alt_end_marker "663"
82#define e_replacement(number) "664"#number 84#define alt_slen "662b-661b"
85#define alt_pad_len alt_end_marker"b-662b"
86#define alt_total_slen alt_end_marker"b-661b"
87#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
83 88
84#define alt_slen "662b-661b" 89#define __OLDINSTR(oldinstr, num) \
85#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" 90 "661:\n\t" oldinstr "\n662:\n" \
91 ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
92 "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
86 93
87#define ALTINSTR_ENTRY(feature, number) \ 94#define OLDINSTR(oldinstr, num) \
95 __OLDINSTR(oldinstr, num) \
96 alt_end_marker ":\n"
97
98/*
99 * max without conditionals. Idea adapted from:
100 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
101 *
102 * The additional "-" is needed because gas works with s32s.
103 */
104#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
105
106/*
107 * Pad the second replacement alternative with additional NOPs if it is
108 * longer than the first replacement alternative.
109 */
110#define OLDINSTR_2(oldinstr, num1, num2) \
111 "661:\n\t" oldinstr "\n662:\n" \
112 ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
113 "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
114 alt_end_marker ":\n"
115
116#define ALTINSTR_ENTRY(feature, num) \
88 " .long 661b - .\n" /* label */ \ 117 " .long 661b - .\n" /* label */ \
89 " .long " b_replacement(number)"f - .\n" /* new instruction */ \ 118 " .long " b_replacement(num)"f - .\n" /* new instruction */ \
90 " .word " __stringify(feature) "\n" /* feature bit */ \ 119 " .word " __stringify(feature) "\n" /* feature bit */ \
91 " .byte " alt_slen "\n" /* source len */ \ 120 " .byte " alt_total_slen "\n" /* source len */ \
92 " .byte " alt_rlen(number) "\n" /* replacement len */ 121 " .byte " alt_rlen(num) "\n" /* replacement len */ \
93 122 " .byte " alt_pad_len "\n" /* pad len */
94#define DISCARD_ENTRY(number) /* rlen <= slen */ \
95 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
96 123
97#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ 124#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
98 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" 125 b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
99 126
100/* alternative assembly primitive: */ 127/* alternative assembly primitive: */
101#define ALTERNATIVE(oldinstr, newinstr, feature) \ 128#define ALTERNATIVE(oldinstr, newinstr, feature) \
102 OLDINSTR(oldinstr) \ 129 OLDINSTR(oldinstr, 1) \
103 ".pushsection .altinstructions,\"a\"\n" \ 130 ".pushsection .altinstructions,\"a\"\n" \
104 ALTINSTR_ENTRY(feature, 1) \ 131 ALTINSTR_ENTRY(feature, 1) \
105 ".popsection\n" \ 132 ".popsection\n" \
106 ".pushsection .discard,\"aw\",@progbits\n" \
107 DISCARD_ENTRY(1) \
108 ".popsection\n" \
109 ".pushsection .altinstr_replacement, \"ax\"\n" \ 133 ".pushsection .altinstr_replacement, \"ax\"\n" \
110 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ 134 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
111 ".popsection" 135 ".popsection"
112 136
113#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ 137#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
114 OLDINSTR(oldinstr) \ 138 OLDINSTR_2(oldinstr, 1, 2) \
115 ".pushsection .altinstructions,\"a\"\n" \ 139 ".pushsection .altinstructions,\"a\"\n" \
116 ALTINSTR_ENTRY(feature1, 1) \ 140 ALTINSTR_ENTRY(feature1, 1) \
117 ALTINSTR_ENTRY(feature2, 2) \ 141 ALTINSTR_ENTRY(feature2, 2) \
118 ".popsection\n" \ 142 ".popsection\n" \
119 ".pushsection .discard,\"aw\",@progbits\n" \
120 DISCARD_ENTRY(1) \
121 DISCARD_ENTRY(2) \
122 ".popsection\n" \
123 ".pushsection .altinstr_replacement, \"ax\"\n" \ 143 ".pushsection .altinstr_replacement, \"ax\"\n" \
124 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ 144 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
125 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ 145 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
146#define alternative(oldinstr, newinstr, feature) \ 166#define alternative(oldinstr, newinstr, feature) \
147 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") 167 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
148 168
169#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
170 asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
171
149/* 172/*
150 * Alternative inline assembly with input. 173 * Alternative inline assembly with input.
151 * 174 *
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..976b86a325e5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
91{ 91{
92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); 92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
93 93
94 alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, 94 alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
95 ASM_OUTPUT2("=r" (v), "=m" (*addr)), 95 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
96 ASM_OUTPUT2("0" (v), "m" (*addr))); 96 ASM_OUTPUT2("0" (v), "m" (*addr)));
97} 97}
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void);
204extern void disconnect_bsp_APIC(int virt_wire_setup); 204extern void disconnect_bsp_APIC(int virt_wire_setup);
205extern void disable_local_APIC(void); 205extern void disable_local_APIC(void);
206extern void lapic_shutdown(void); 206extern void lapic_shutdown(void);
207extern int verify_local_APIC(void);
208extern void sync_Arb_IDs(void); 207extern void sync_Arb_IDs(void);
209extern void init_bsp_APIC(void); 208extern void init_bsp_APIC(void);
210extern void setup_local_APIC(void); 209extern void setup_local_APIC(void);
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
95 * Stop RDTSC speculation. This is needed when you need to use RDTSC 95 * Stop RDTSC speculation. This is needed when you need to use RDTSC
96 * (or get_cycles or vread that possibly accesses the TSC) in a defined 96 * (or get_cycles or vread that possibly accesses the TSC) in a defined
97 * code region. 97 * code region.
98 *
99 * (Could use an alternative three way for this if there was one.)
100 */ 98 */
101static __always_inline void rdtsc_barrier(void) 99static __always_inline void rdtsc_barrier(void)
102{ 100{
103 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 101 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
104 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 102 "lfence", X86_FEATURE_LFENCE_RDTSC);
105} 103}
106 104
107#endif /* _ASM_X86_BARRIER_H */ 105#endif /* _ASM_X86_BARRIER_H */
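
With the instruction padding in place, rdtsc_barrier() collapses into a single alternative_2() patch site: nothing, MFENCE, or LFENCE, with NOPs filled in as needed. A hedged sketch of how such a barrier is typically paired with the TSC read (illustrative only; this helper is an assumption, not part of this diff):

/* Sketch: read the TSC with speculation stopped by the patched barrier. */
static __always_inline unsigned long long rdtsc_sketch(void)
{
	unsigned int lo, hi;

	rdtsc_barrier();	/* patched to mfence/lfence/nothing via alternatives */
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long)hi << 32) | lo;
}
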
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..1c8b50edb2db 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with
55 * for assembly code: 55 * for assembly code:
56 */ 56 */
57 57
58#define R15 0 58/* The layout forms the "struct pt_regs" on the stack: */
59#define R14 8 59/*
60#define R13 16 60 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
61#define R12 24 61 * unless syscall needs a complete, fully filled "struct pt_regs".
62#define RBP 32 62 */
63#define RBX 40 63#define R15 0*8
64 64#define R14 1*8
65/* arguments: interrupts/non tracing syscalls only save up to here: */ 65#define R13 2*8
66#define R11 48 66#define R12 3*8
67#define R10 56 67#define RBP 4*8
68#define R9 64 68#define RBX 5*8
69#define R8 72 69/* These regs are callee-clobbered. Always saved on kernel entry. */
70#define RAX 80 70#define R11 6*8
71#define RCX 88 71#define R10 7*8
72#define RDX 96 72#define R9 8*8
73#define RSI 104 73#define R8 9*8
74#define RDI 112 74#define RAX 10*8
75#define ORIG_RAX 120 /* + error_code */ 75#define RCX 11*8
76/* end of arguments */ 76#define RDX 12*8
77 77#define RSI 13*8
78/* cpu exception frame or undefined in case of fast syscall: */ 78#define RDI 14*8
79#define RIP 128 79/*
80#define CS 136 80 * On syscall entry, this is syscall#. On CPU exception, this is error code.
81#define EFLAGS 144 81 * On hw interrupt, it's IRQ number:
82#define RSP 152 82 */
83#define SS 160 83#define ORIG_RAX 15*8
84 84/* Return frame for iretq */
85#define ARGOFFSET R11 85#define RIP 16*8
86 86#define CS 17*8
87 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 87#define EFLAGS 18*8
88 subq $9*8+\addskip, %rsp 88#define RSP 19*8
89 CFI_ADJUST_CFA_OFFSET 9*8+\addskip 89#define SS 20*8
90 movq_cfi rdi, 8*8 90
91 movq_cfi rsi, 7*8 91#define SIZEOF_PTREGS 21*8
92 movq_cfi rdx, 6*8 92
93 93 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
94 .if \save_rcx 94 subq $15*8+\addskip, %rsp
95 movq_cfi rcx, 5*8 95 CFI_ADJUST_CFA_OFFSET 15*8+\addskip
96 .endif 96 .endm
97 97
98 .if \rax_enosys 98 .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
99 movq $-ENOSYS, 4*8(%rsp) 99 .if \r11
100 .else 100 movq_cfi r11, 6*8+\offset
101 movq_cfi rax, 4*8
102 .endif 101 .endif
103 102 .if \r8910
104 .if \save_r891011 103 movq_cfi r10, 7*8+\offset
105 movq_cfi r8, 3*8 104 movq_cfi r9, 8*8+\offset
106 movq_cfi r9, 2*8 105 movq_cfi r8, 9*8+\offset
107 movq_cfi r10, 1*8 106 .endif
108 movq_cfi r11, 0*8 107 .if \rax
108 movq_cfi rax, 10*8+\offset
109 .endif
110 .if \rcx
111 movq_cfi rcx, 11*8+\offset
109 .endif 112 .endif
113 movq_cfi rdx, 12*8+\offset
114 movq_cfi rsi, 13*8+\offset
115 movq_cfi rdi, 14*8+\offset
116 .endm
117 .macro SAVE_C_REGS offset=0
118 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
119 .endm
120 .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
121 SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
122 .endm
123 .macro SAVE_C_REGS_EXCEPT_R891011
124 SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
125 .endm
126 .macro SAVE_C_REGS_EXCEPT_RCX_R891011
127 SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
128 .endm
129 .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
130 SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
131 .endm
132
133 .macro SAVE_EXTRA_REGS offset=0
134 movq_cfi r15, 0*8+\offset
135 movq_cfi r14, 1*8+\offset
136 movq_cfi r13, 2*8+\offset
137 movq_cfi r12, 3*8+\offset
138 movq_cfi rbp, 4*8+\offset
139 movq_cfi rbx, 5*8+\offset
140 .endm
141 .macro SAVE_EXTRA_REGS_RBP offset=0
142 movq_cfi rbp, 4*8+\offset
143 .endm
110 144
145 .macro RESTORE_EXTRA_REGS offset=0
146 movq_cfi_restore 0*8+\offset, r15
147 movq_cfi_restore 1*8+\offset, r14
148 movq_cfi_restore 2*8+\offset, r13
149 movq_cfi_restore 3*8+\offset, r12
150 movq_cfi_restore 4*8+\offset, rbp
151 movq_cfi_restore 5*8+\offset, rbx
111 .endm 152 .endm
112 153
113#define ARG_SKIP (9*8) 154 .macro ZERO_EXTRA_REGS
155 xorl %r15d, %r15d
156 xorl %r14d, %r14d
157 xorl %r13d, %r13d
158 xorl %r12d, %r12d
159 xorl %ebp, %ebp
160 xorl %ebx, %ebx
161 .endm
114 162
115 .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ 163 .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
116 rstor_r8910=1, rstor_rdx=1
117 .if \rstor_r11 164 .if \rstor_r11
118 movq_cfi_restore 0*8, r11 165 movq_cfi_restore 6*8, r11
119 .endif 166 .endif
120
121 .if \rstor_r8910 167 .if \rstor_r8910
122 movq_cfi_restore 1*8, r10 168 movq_cfi_restore 7*8, r10
123 movq_cfi_restore 2*8, r9 169 movq_cfi_restore 8*8, r9
124 movq_cfi_restore 3*8, r8 170 movq_cfi_restore 9*8, r8
125 .endif 171 .endif
126
127 .if \rstor_rax 172 .if \rstor_rax
128 movq_cfi_restore 4*8, rax 173 movq_cfi_restore 10*8, rax
129 .endif 174 .endif
130
131 .if \rstor_rcx 175 .if \rstor_rcx
132 movq_cfi_restore 5*8, rcx 176 movq_cfi_restore 11*8, rcx
133 .endif 177 .endif
134
135 .if \rstor_rdx 178 .if \rstor_rdx
136 movq_cfi_restore 6*8, rdx 179 movq_cfi_restore 12*8, rdx
137 .endif
138
139 movq_cfi_restore 7*8, rsi
140 movq_cfi_restore 8*8, rdi
141
142 .if ARG_SKIP+\addskip > 0
143 addq $ARG_SKIP+\addskip, %rsp
144 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
145 .endif 180 .endif
181 movq_cfi_restore 13*8, rsi
182 movq_cfi_restore 14*8, rdi
146 .endm 183 .endm
147 184 .macro RESTORE_C_REGS
148 .macro LOAD_ARGS offset, skiprax=0 185 RESTORE_C_REGS_HELPER 1,1,1,1,1
149 movq \offset(%rsp), %r11
150 movq \offset+8(%rsp), %r10
151 movq \offset+16(%rsp), %r9
152 movq \offset+24(%rsp), %r8
153 movq \offset+40(%rsp), %rcx
154 movq \offset+48(%rsp), %rdx
155 movq \offset+56(%rsp), %rsi
156 movq \offset+64(%rsp), %rdi
157 .if \skiprax
158 .else
159 movq \offset+72(%rsp), %rax
160 .endif
161 .endm 186 .endm
162 187 .macro RESTORE_C_REGS_EXCEPT_RAX
163#define REST_SKIP (6*8) 188 RESTORE_C_REGS_HELPER 0,1,1,1,1
164
165 .macro SAVE_REST
166 subq $REST_SKIP, %rsp
167 CFI_ADJUST_CFA_OFFSET REST_SKIP
168 movq_cfi rbx, 5*8
169 movq_cfi rbp, 4*8
170 movq_cfi r12, 3*8
171 movq_cfi r13, 2*8
172 movq_cfi r14, 1*8
173 movq_cfi r15, 0*8
174 .endm 189 .endm
175 190 .macro RESTORE_C_REGS_EXCEPT_RCX
176 .macro RESTORE_REST 191 RESTORE_C_REGS_HELPER 1,0,1,1,1
177 movq_cfi_restore 0*8, r15
178 movq_cfi_restore 1*8, r14
179 movq_cfi_restore 2*8, r13
180 movq_cfi_restore 3*8, r12
181 movq_cfi_restore 4*8, rbp
182 movq_cfi_restore 5*8, rbx
183 addq $REST_SKIP, %rsp
184 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
185 .endm 192 .endm
186 193 .macro RESTORE_C_REGS_EXCEPT_R11
187 .macro SAVE_ALL 194 RESTORE_C_REGS_HELPER 1,1,0,1,1
188 SAVE_ARGS 195 .endm
189 SAVE_REST 196 .macro RESTORE_C_REGS_EXCEPT_RCX_R11
197 RESTORE_C_REGS_HELPER 1,0,0,1,1
198 .endm
199 .macro RESTORE_RSI_RDI
200 RESTORE_C_REGS_HELPER 0,0,0,0,0
201 .endm
202 .macro RESTORE_RSI_RDI_RDX
203 RESTORE_C_REGS_HELPER 0,0,0,0,1
190 .endm 204 .endm
191 205
192 .macro RESTORE_ALL addskip=0 206 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
193 RESTORE_REST 207 addq $15*8+\addskip, %rsp
194 RESTORE_ARGS 1, \addskip 208 CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
195 .endm 209 .endm
196 210
197 .macro icebp 211 .macro icebp
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with
210 */ 224 */
211 225
212 .macro SAVE_ALL 226 .macro SAVE_ALL
213 pushl_cfi %eax 227 pushl_cfi_reg eax
214 CFI_REL_OFFSET eax, 0 228 pushl_cfi_reg ebp
215 pushl_cfi %ebp 229 pushl_cfi_reg edi
216 CFI_REL_OFFSET ebp, 0 230 pushl_cfi_reg esi
217 pushl_cfi %edi 231 pushl_cfi_reg edx
218 CFI_REL_OFFSET edi, 0 232 pushl_cfi_reg ecx
219 pushl_cfi %esi 233 pushl_cfi_reg ebx
220 CFI_REL_OFFSET esi, 0
221 pushl_cfi %edx
222 CFI_REL_OFFSET edx, 0
223 pushl_cfi %ecx
224 CFI_REL_OFFSET ecx, 0
225 pushl_cfi %ebx
226 CFI_REL_OFFSET ebx, 0
227 .endm 234 .endm
228 235
229 .macro RESTORE_ALL 236 .macro RESTORE_ALL
230 popl_cfi %ebx 237 popl_cfi_reg ebx
231 CFI_RESTORE ebx 238 popl_cfi_reg ecx
232 popl_cfi %ecx 239 popl_cfi_reg edx
233 CFI_RESTORE ecx 240 popl_cfi_reg esi
234 popl_cfi %edx 241 popl_cfi_reg edi
235 CFI_RESTORE edx 242 popl_cfi_reg ebp
236 popl_cfi %esi 243 popl_cfi_reg eax
237 CFI_RESTORE esi
238 popl_cfi %edi
239 CFI_RESTORE edi
240 popl_cfi %ebp
241 CFI_RESTORE ebp
242 popl_cfi %eax
243 CFI_RESTORE eax
244 .endm 244 .endm
245 245
246#endif /* CONFIG_X86_64 */ 246#endif /* CONFIG_X86_64 */
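
The new R15..SS constants above are byte offsets into the struct pt_regs that the entry code now always builds at the top of the kernel stack, and SIZEOF_PTREGS is its total size. A compile-time cross-check of that correspondence might look like the following (illustrative sketch for the 64-bit pt_regs; the tree generates its real asm offsets elsewhere):

#include <stddef.h>
#include <asm/ptrace.h>

/* Illustrative only: the asm-side constants mirror offsetof(struct pt_regs, ...) */
_Static_assert(offsetof(struct pt_regs, r15)     ==  0*8, "R15");
_Static_assert(offsetof(struct pt_regs, bx)      ==  5*8, "RBX");
_Static_assert(offsetof(struct pt_regs, ax)      == 10*8, "RAX");
_Static_assert(offsetof(struct pt_regs, orig_ax) == 15*8, "ORIG_RAX");
_Static_assert(offsetof(struct pt_regs, ip)      == 16*8, "RIP");
_Static_assert(offsetof(struct pt_regs, ss)      == 20*8, "SS");
_Static_assert(sizeof(struct pt_regs)            == 21*8, "SIZEOF_PTREGS");
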
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
301 sp = task_pt_regs(current)->sp; 301 sp = task_pt_regs(current)->sp;
302 } else { 302 } else {
303 /* -128 for the x32 ABI redzone */ 303 /* -128 for the x32 ABI redzone */
304 sp = this_cpu_read(old_rsp) - 128; 304 sp = task_pt_regs(current)->sp - 128;
305 } 305 }
306 306
307 return (void __user *)round_down(sp - len, 16); 307 return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
34#endif 34#endif
35#endif 35#endif
36 36
37DECLARE_PER_CPU(int, cpu_state);
38
39int mwait_usable(const struct cpuinfo_x86 *); 37int mwait_usable(const struct cpuinfo_x86 *);
40 38
41#endif /* _ASM_X86_CPU_H */ 39#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index c1553b70fed4..7ee9b94d9921 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -233,7 +233,9 @@
233#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ 233#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
234#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ 234#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
235#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ 235#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
236#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
236#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 237#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
238#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
237#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ 239#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
238#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ 240#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
239#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ 241#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
@@ -426,6 +428,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
426 " .word %P0\n" /* 1: do replace */ 428 " .word %P0\n" /* 1: do replace */
427 " .byte 2b - 1b\n" /* source len */ 429 " .byte 2b - 1b\n" /* source len */
428 " .byte 0\n" /* replacement len */ 430 " .byte 0\n" /* replacement len */
431 " .byte 0\n" /* pad len */
429 ".previous\n" 432 ".previous\n"
430 /* skipping size check since replacement size = 0 */ 433 /* skipping size check since replacement size = 0 */
431 : : "i" (X86_FEATURE_ALWAYS) : : t_warn); 434 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -440,6 +443,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
440 " .word %P0\n" /* feature bit */ 443 " .word %P0\n" /* feature bit */
441 " .byte 2b - 1b\n" /* source len */ 444 " .byte 2b - 1b\n" /* source len */
442 " .byte 0\n" /* replacement len */ 445 " .byte 0\n" /* replacement len */
446 " .byte 0\n" /* pad len */
443 ".previous\n" 447 ".previous\n"
444 /* skipping size check since replacement size = 0 */ 448 /* skipping size check since replacement size = 0 */
445 : : "i" (bit) : : t_no); 449 : : "i" (bit) : : t_no);
@@ -465,6 +469,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
465 " .word %P1\n" /* feature bit */ 469 " .word %P1\n" /* feature bit */
466 " .byte 2b - 1b\n" /* source len */ 470 " .byte 2b - 1b\n" /* source len */
467 " .byte 4f - 3f\n" /* replacement len */ 471 " .byte 4f - 3f\n" /* replacement len */
472 " .byte 0\n" /* pad len */
468 ".previous\n" 473 ".previous\n"
469 ".section .discard,\"aw\",@progbits\n" 474 ".section .discard,\"aw\",@progbits\n"
470 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 475 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -491,31 +496,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
491static __always_inline __pure bool _static_cpu_has_safe(u16 bit) 496static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
492{ 497{
493#ifdef CC_HAVE_ASM_GOTO 498#ifdef CC_HAVE_ASM_GOTO
494/* 499 asm_volatile_goto("1: jmp %l[t_dynamic]\n"
495 * We need to spell the jumps to the compiler because, depending on the offset,
496 * the replacement jump can be bigger than the original jump, and this we cannot
497 * have. Thus, we force the jump to the widest, 4-byte, signed relative
498 * offset even though the last would often fit in less bytes.
499 */
500 asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
501 "2:\n" 500 "2:\n"
501 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
502 "((5f-4f) - (2b-1b)),0x90\n"
503 "3:\n"
502 ".section .altinstructions,\"a\"\n" 504 ".section .altinstructions,\"a\"\n"
503 " .long 1b - .\n" /* src offset */ 505 " .long 1b - .\n" /* src offset */
504 " .long 3f - .\n" /* repl offset */ 506 " .long 4f - .\n" /* repl offset */
505 " .word %P1\n" /* always replace */ 507 " .word %P1\n" /* always replace */
506 " .byte 2b - 1b\n" /* src len */ 508 " .byte 3b - 1b\n" /* src len */
507 " .byte 4f - 3f\n" /* repl len */ 509 " .byte 5f - 4f\n" /* repl len */
510 " .byte 3b - 2b\n" /* pad len */
508 ".previous\n" 511 ".previous\n"
509 ".section .altinstr_replacement,\"ax\"\n" 512 ".section .altinstr_replacement,\"ax\"\n"
510 "3: .byte 0xe9\n .long %l[t_no] - 2b\n" 513 "4: jmp %l[t_no]\n"
511 "4:\n" 514 "5:\n"
512 ".previous\n" 515 ".previous\n"
513 ".section .altinstructions,\"a\"\n" 516 ".section .altinstructions,\"a\"\n"
514 " .long 1b - .\n" /* src offset */ 517 " .long 1b - .\n" /* src offset */
515 " .long 0\n" /* no replacement */ 518 " .long 0\n" /* no replacement */
516 " .word %P0\n" /* feature bit */ 519 " .word %P0\n" /* feature bit */
517 " .byte 2b - 1b\n" /* src len */ 520 " .byte 3b - 1b\n" /* src len */
518 " .byte 0\n" /* repl len */ 521 " .byte 0\n" /* repl len */
522 " .byte 0\n" /* pad len */
519 ".previous\n" 523 ".previous\n"
520 : : "i" (bit), "i" (X86_FEATURE_ALWAYS) 524 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
521 : : t_dynamic, t_no); 525 : : t_dynamic, t_no);
@@ -535,6 +539,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
535 " .word %P2\n" /* always replace */ 539 " .word %P2\n" /* always replace */
536 " .byte 2b - 1b\n" /* source len */ 540 " .byte 2b - 1b\n" /* source len */
537 " .byte 4f - 3f\n" /* replacement len */ 541 " .byte 4f - 3f\n" /* replacement len */
542 " .byte 0\n" /* pad len */
538 ".previous\n" 543 ".previous\n"
539 ".section .discard,\"aw\",@progbits\n" 544 ".section .discard,\"aw\",@progbits\n"
540 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 545 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -549,6 +554,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
549 " .word %P1\n" /* feature bit */ 554 " .word %P1\n" /* feature bit */
550 " .byte 4b - 3b\n" /* src len */ 555 " .byte 4b - 3b\n" /* src len */
551 " .byte 6f - 5f\n" /* repl len */ 556 " .byte 6f - 5f\n" /* repl len */
557 " .byte 0\n" /* pad len */
552 ".previous\n" 558 ".previous\n"
553 ".section .discard,\"aw\",@progbits\n" 559 ".section .discard,\"aw\",@progbits\n"
554 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ 560 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
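
The asm-goto variant above compiles to a jump that is rewritten once alternatives run, so the feature test costs a single (possibly NOP-padded) branch at runtime. A hedged usage sketch, assuming the usual static_cpu_has_safe() wrapper around _static_cpu_has_safe() (illustrative only):

/* Sketch: prefer the patched-jump feature test on a hot path. */
static inline void flush_line_sketch(void *addr)
{
	if (static_cpu_has_safe(X86_FEATURE_CLFLUSHOPT))
		asm volatile("clflushopt %0" : "+m" (*(volatile char *)addr));
	else
		asm volatile("clflush %0" : "+m" (*(volatile char *)addr));
}
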
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
376 * Pentium F0 0F bugfix can have resulted in the mapped 376 * Pentium F0 0F bugfix can have resulted in the mapped
377 * IDT being write-protected. 377 * IDT being write-protected.
378 */ 378 */
379#define set_intr_gate(n, addr) \ 379#define set_intr_gate_notrace(n, addr) \
380 do { \ 380 do { \
381 BUG_ON((unsigned)n > 0xFF); \ 381 BUG_ON((unsigned)n > 0xFF); \
382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ 382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
383 __KERNEL_CS); \ 383 __KERNEL_CS); \
384 } while (0)
385
386#define set_intr_gate(n, addr) \
387 do { \
388 set_intr_gate_notrace(n, addr); \
384 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 389 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
385 0, 0, __KERNEL_CS); \ 390 0, 0, __KERNEL_CS); \
386 } while (0) 391 } while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
86 CFI_ADJUST_CFA_OFFSET 8 86 CFI_ADJUST_CFA_OFFSET 8
87 .endm 87 .endm
88 88
89 .macro pushq_cfi_reg reg
90 pushq %\reg
91 CFI_ADJUST_CFA_OFFSET 8
92 CFI_REL_OFFSET \reg, 0
93 .endm
94
89 .macro popq_cfi reg 95 .macro popq_cfi reg
90 popq \reg 96 popq \reg
91 CFI_ADJUST_CFA_OFFSET -8 97 CFI_ADJUST_CFA_OFFSET -8
92 .endm 98 .endm
93 99
100 .macro popq_cfi_reg reg
101 popq %\reg
102 CFI_ADJUST_CFA_OFFSET -8
103 CFI_RESTORE \reg
104 .endm
105
94 .macro pushfq_cfi 106 .macro pushfq_cfi
95 pushfq 107 pushfq
96 CFI_ADJUST_CFA_OFFSET 8 108 CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
116 CFI_ADJUST_CFA_OFFSET 4 128 CFI_ADJUST_CFA_OFFSET 4
117 .endm 129 .endm
118 130
131 .macro pushl_cfi_reg reg
132 pushl %\reg
133 CFI_ADJUST_CFA_OFFSET 4
134 CFI_REL_OFFSET \reg, 0
135 .endm
136
119 .macro popl_cfi reg 137 .macro popl_cfi reg
120 popl \reg 138 popl \reg
121 CFI_ADJUST_CFA_OFFSET -4 139 CFI_ADJUST_CFA_OFFSET -4
122 .endm 140 .endm
123 141
142 .macro popl_cfi_reg reg
143 popl %\reg
144 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE \reg
146 .endm
147
124 .macro pushfl_cfi 148 .macro pushfl_cfi
125 pushfl 149 pushfl
126 CFI_ADJUST_CFA_OFFSET 4 150 CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
40} 40}
41#endif 41#endif
42 42
43#ifdef CONFIG_MEMTEST
44extern void early_memtest(unsigned long start, unsigned long end);
45#else
46static inline void early_memtest(unsigned long start, unsigned long end)
47{
48}
49#endif
50
51extern unsigned long e820_end_of_ram_pfn(void); 43extern unsigned long e820_end_of_ram_pfn(void);
52extern unsigned long e820_end_of_low_ram_pfn(void); 44extern unsigned long e820_end_of_low_ram_pfn(void);
53extern u64 early_reserve_e820(u64 sizet, u64 align); 45extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 25bce45c6fc4..3738b138b843 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -2,6 +2,8 @@
2#define _ASM_X86_EFI_H 2#define _ASM_X86_EFI_H
3 3
4#include <asm/i387.h> 4#include <asm/i387.h>
5#include <asm/pgtable.h>
6
5/* 7/*
6 * We map the EFI regions needed for runtime services non-contiguously, 8 * We map the EFI regions needed for runtime services non-contiguously,
7 * with preserved alignment on virtual addresses starting from -4G down 9 * with preserved alignment on virtual addresses starting from -4G down
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
89extern struct efi_scratch efi_scratch; 91extern struct efi_scratch efi_scratch;
90extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); 92extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
91extern int __init efi_memblock_x86_reserve_range(void); 93extern int __init efi_memblock_x86_reserve_range(void);
92extern void __init efi_call_phys_prolog(void); 94extern pgd_t * __init efi_call_phys_prolog(void);
93extern void __init efi_call_phys_epilog(void); 95extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
94extern void __init efi_unmap_memmap(void); 96extern void __init efi_unmap_memmap(void);
95extern void __init efi_memory_uc(u64 addr, unsigned long size); 97extern void __init efi_memory_uc(u64 addr, unsigned long size);
96extern void __init efi_map_region(efi_memory_desc_t *md); 98extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,10 +171,11 @@ do { \
171static inline void elf_common_init(struct thread_struct *t, 171static inline void elf_common_init(struct thread_struct *t,
172 struct pt_regs *regs, const u16 ds) 172 struct pt_regs *regs, const u16 ds)
173{ 173{
174 regs->ax = regs->bx = regs->cx = regs->dx = 0; 174 /* Commented-out registers are cleared in stub_execve */
175 regs->si = regs->di = regs->bp = 0; 175 /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
176 regs->si = regs->di /*= regs->bp*/ = 0;
176 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; 177 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
177 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 178 /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
178 t->fs = t->gs = 0; 179 t->fs = t->gs = 0;
179 t->fsindex = t->gsindex = 0; 180 t->fsindex = t->gsindex = 0;
180 t->ds = t->es = ds; 181 t->ds = t->es = ds;
@@ -338,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
338 int uses_interp); 339 int uses_interp);
339#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages 340#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
340 341
341extern unsigned long arch_randomize_brk(struct mm_struct *mm);
342#define arch_randomize_brk arch_randomize_brk
343
344/* 342/*
345 * True on X86_32 or when emulating IA32 on X86_64 343 * True on X86_32 or when emulating IA32 on X86_64
346 */ 344 */
@@ -365,6 +363,7 @@ enum align_flags {
365struct va_alignment { 363struct va_alignment {
366 int flags; 364 int flags;
367 unsigned long mask; 365 unsigned long mask;
366 unsigned long bits;
368} ____cacheline_aligned; 367} ____cacheline_aligned;
369 368
370extern struct va_alignment va_align; 369extern struct va_alignment va_align;
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 72ba21a8b5fc..da5e96756570 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
67static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} 67static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
68#endif 68#endif
69 69
70/*
71 * Must be run with preemption disabled: this clears the fpu_owner_task
72 * on this CPU.
73 *
74 * This will disable any lazy FPU state restore of the current FPU state,
75 * but if the current thread owns the FPU, its state will still be saved.
76 */
77static inline void __cpu_disable_lazy_restore(unsigned int cpu)
78{
79 per_cpu(fpu_owner_task, cpu) = NULL;
80}
81
82/*
83 * Used to indicate that the FPU state in memory is newer than the FPU
84 * state in registers, and the FPU state should be reloaded next time the
85 * task is run. Only safe on the current task, or non-running tasks.
86 */
87static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
88{
89 tsk->thread.fpu.last_cpu = ~0;
90}
91
92static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
93{
94 return new == this_cpu_read_stable(fpu_owner_task) &&
95 cpu == new->thread.fpu.last_cpu;
96}
97
70static inline int is_ia32_compat_frame(void) 98static inline int is_ia32_compat_frame(void)
71{ 99{
72 return config_enabled(CONFIG_IA32_EMULATION) && 100 return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void)
107 135
108static inline void fx_finit(struct i387_fxsave_struct *fx) 136static inline void fx_finit(struct i387_fxsave_struct *fx)
109{ 137{
110 memset(fx, 0, xstate_size);
111 fx->cwd = 0x37f; 138 fx->cwd = 0x37f;
112 fx->mxcsr = MXCSR_DEFAULT; 139 fx->mxcsr = MXCSR_DEFAULT;
113} 140}
@@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
351 __thread_set_has_fpu(tsk); 378 __thread_set_has_fpu(tsk);
352} 379}
353 380
354static inline void __drop_fpu(struct task_struct *tsk) 381static inline void drop_fpu(struct task_struct *tsk)
355{ 382{
383 /*
384 * Forget coprocessor state..
385 */
386 preempt_disable();
387 tsk->thread.fpu_counter = 0;
388
356 if (__thread_has_fpu(tsk)) { 389 if (__thread_has_fpu(tsk)) {
357 /* Ignore delayed exceptions from user space */ 390 /* Ignore delayed exceptions from user space */
358 asm volatile("1: fwait\n" 391 asm volatile("1: fwait\n"
@@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk)
360 _ASM_EXTABLE(1b, 2b)); 393 _ASM_EXTABLE(1b, 2b));
361 __thread_fpu_end(tsk); 394 __thread_fpu_end(tsk);
362 } 395 }
363}
364 396
365static inline void drop_fpu(struct task_struct *tsk)
366{
367 /*
368 * Forget coprocessor state..
369 */
370 preempt_disable();
371 tsk->thread.fpu_counter = 0;
372 __drop_fpu(tsk);
373 clear_stopped_child_used_math(tsk); 397 clear_stopped_child_used_math(tsk);
374 preempt_enable(); 398 preempt_enable();
375} 399}
376 400
377static inline void drop_init_fpu(struct task_struct *tsk) 401static inline void restore_init_xstate(void)
402{
403 if (use_xsave())
404 xrstor_state(init_xstate_buf, -1);
405 else
406 fxrstor_checking(&init_xstate_buf->i387);
407}
408
409/*
410 * Reset the FPU state in the eager case and drop it in the lazy case (later use
411 * will reinit it).
412 */
413static inline void fpu_reset_state(struct task_struct *tsk)
378{ 414{
379 if (!use_eager_fpu()) 415 if (!use_eager_fpu())
380 drop_fpu(tsk); 416 drop_fpu(tsk);
381 else { 417 else
382 if (use_xsave()) 418 restore_init_xstate();
383 xrstor_state(init_xstate_buf, -1);
384 else
385 fxrstor_checking(&init_xstate_buf->i387);
386 }
387} 419}
388 420
389/* 421/*
@@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
400 */ 432 */
401typedef struct { int preload; } fpu_switch_t; 433typedef struct { int preload; } fpu_switch_t;
402 434
403/*
404 * Must be run with preemption disabled: this clears the fpu_owner_task,
405 * on this CPU.
406 *
407 * This will disable any lazy FPU state restore of the current FPU state,
408 * but if the current thread owns the FPU, it will still be saved by.
409 */
410static inline void __cpu_disable_lazy_restore(unsigned int cpu)
411{
412 per_cpu(fpu_owner_task, cpu) = NULL;
413}
414
415static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
416{
417 return new == this_cpu_read_stable(fpu_owner_task) &&
418 cpu == new->thread.fpu.last_cpu;
419}
420
421static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) 435static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
422{ 436{
423 fpu_switch_t fpu; 437 fpu_switch_t fpu;
@@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
426 * If the task has used the math, pre-load the FPU on xsave processors 440 * If the task has used the math, pre-load the FPU on xsave processors
427 * or if the past 5 consecutive context-switches used math. 441 * or if the past 5 consecutive context-switches used math.
428 */ 442 */
429 fpu.preload = tsk_used_math(new) && (use_eager_fpu() || 443 fpu.preload = tsk_used_math(new) &&
430 new->thread.fpu_counter > 5); 444 (use_eager_fpu() || new->thread.fpu_counter > 5);
445
431 if (__thread_has_fpu(old)) { 446 if (__thread_has_fpu(old)) {
432 if (!__save_init_fpu(old)) 447 if (!__save_init_fpu(old))
433 cpu = ~0; 448 task_disable_lazy_fpu_restore(old);
434 old->thread.fpu.last_cpu = cpu; 449 else
435 old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ 450 old->thread.fpu.last_cpu = cpu;
451
452 /* But leave fpu_owner_task! */
453 old->thread.fpu.has_fpu = 0;
436 454
437 /* Don't change CR0.TS if we just switch! */ 455 /* Don't change CR0.TS if we just switch! */
438 if (fpu.preload) { 456 if (fpu.preload) {
@@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
443 stts(); 461 stts();
444 } else { 462 } else {
445 old->thread.fpu_counter = 0; 463 old->thread.fpu_counter = 0;
446 old->thread.fpu.last_cpu = ~0; 464 task_disable_lazy_fpu_restore(old);
447 if (fpu.preload) { 465 if (fpu.preload) {
448 new->thread.fpu_counter++; 466 new->thread.fpu_counter++;
449 if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) 467 if (fpu_lazy_restore(new, cpu))
450 fpu.preload = 0; 468 fpu.preload = 0;
451 else 469 else
452 prefetch(new->thread.fpu.state); 470 prefetch(new->thread.fpu.state);
@@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
466{ 484{
467 if (fpu.preload) { 485 if (fpu.preload) {
468 if (unlikely(restore_fpu_checking(new))) 486 if (unlikely(restore_fpu_checking(new)))
469 drop_init_fpu(new); 487 fpu_reset_state(new);
470 } 488 }
471} 489}
472 490
@@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
495} 513}
496 514
497/* 515/*
498 * Need to be preemption-safe. 516 * Needs to be preemption-safe.
499 * 517 *
500 * NOTE! user_fpu_begin() must be used only immediately before restoring 518 * NOTE! user_fpu_begin() must be used only immediately before restoring
501 * it. This function does not do any save/restore on their own. 519 * the save state. It does not do any saving/restoring on its own. In
520 * lazy FPU mode, it is just an optimization to avoid a #NM exception;
521 * the task can lose the FPU right after preempt_enable().
502 */ 522 */
503static inline void user_fpu_begin(void) 523static inline void user_fpu_begin(void)
504{ 524{
@@ -520,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk)
520} 540}
521 541
522/* 542/*
523 * These disable preemption on their own and are safe
524 */
525static inline void save_init_fpu(struct task_struct *tsk)
526{
527 WARN_ON_ONCE(!__thread_has_fpu(tsk));
528
529 if (use_eager_fpu()) {
530 __save_fpu(tsk);
531 return;
532 }
533
534 preempt_disable();
535 __save_init_fpu(tsk);
536 __thread_fpu_end(tsk);
537 preempt_enable();
538}
539
540/*
541 * i387 state interaction 543 * i387 state interaction
542 */ 544 */
543static inline unsigned short get_fpu_cwd(struct task_struct *tsk) 545static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
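
The reshuffled fpu-internal.h helpers above boil down to one decision at context switch: preload the next task's FPU state if it has used math and either eager FPU is on or the task used math on more than five consecutive switches, and skip the actual register reload when fpu_lazy_restore() says this CPU still holds that task's state. The following is a minimal userspace sketch of that decision under lazy FPU, not kernel code; struct task, want_preload() and the two globals are simplified stand-ins for task_struct, use_eager_fpu() and the per-CPU fpu_owner_task.

#include <stdbool.h>
#include <stdio.h>

struct task {
        bool used_math;
        unsigned int fpu_counter;       /* consecutive switches that used math */
        int last_cpu;                   /* ~0 means "memory copy is newer" */
};

static bool eager_fpu;                  /* use_eager_fpu() stand-in: lazy mode here */
static struct task *fpu_owner;          /* per-CPU fpu_owner_task stand-in */

static bool fpu_lazy_restore(struct task *next, int cpu)
{
        /* Registers still hold next's state: same owner, same CPU. */
        return next == fpu_owner && cpu == next->last_cpu;
}

static bool want_preload(struct task *next, int cpu)
{
        bool preload = next->used_math &&
                       (eager_fpu || next->fpu_counter > 5);

        if (preload && fpu_lazy_restore(next, cpu))
                preload = false;        /* nothing to reload from memory */
        return preload;
}

int main(void)
{
        struct task t = { .used_math = true, .fpu_counter = 7, .last_cpu = 0 };

        fpu_owner = &t;
        printf("switch in on cpu 0 -> preload=%d\n", want_preload(&t, 0)); /* 0 */
        printf("switch in on cpu 1 -> preload=%d\n", want_preload(&t, 1)); /* 1 */
        return 0;
}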
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 9662290e0b20..e9571ddabc4f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
181extern __visible void smp_invalidate_interrupt(struct pt_regs *); 181extern __visible void smp_invalidate_interrupt(struct pt_regs *);
182#endif 182#endif
183 183
184extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR 184extern char irq_entries_start[];
185 - FIRST_EXTERNAL_VECTOR])(void);
186#ifdef CONFIG_TRACING 185#ifdef CONFIG_TRACING
187#define trace_interrupt interrupt 186#define trace_irq_entries_start irq_entries_start
188#endif 187#endif
189 188
190#define VECTOR_UNDEFINED (-1) 189#define VECTOR_UNDEFINED (-1)
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
69 const insn_byte_t *next_byte; 69 const insn_byte_t *next_byte;
70}; 70};
71 71
72#define MAX_INSN_SIZE 16 72#define MAX_INSN_SIZE 15
73 73
74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) 74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) 75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
index f42a04735a0a..e37d6b3ad983 100644
--- a/arch/x86/include/asm/iommu_table.h
+++ b/arch/x86/include/asm/iommu_table.h
@@ -79,11 +79,12 @@ struct iommu_table_entry {
79 * d). Similar to the 'init', except that this gets called from pci_iommu_init 79 * d). Similar to the 'init', except that this gets called from pci_iommu_init
80 * where we do have a memory allocator. 80 * where we do have a memory allocator.
81 * 81 *
82 * The standard vs the _FINISH differs in that the _FINISH variant will 82 * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
83 * continue detecting other IOMMUs in the call list after the 83 * in that the former will continue detecting other IOMMUs in the call
84 * the detection routine returns a positive number. The _FINISH will 84 * list after the detection routine returns a positive number, while the
85 * stop the execution chain. Both will still call the 'init' and 85 * latter will stop the execution chain upon first successful detection.
86 * 'late_init' functions if they are set. 86 * Both variants will still call the 'init' and 'late_init' functions if
87 * they are set.
87 */ 88 */
88#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ 89#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
89 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) 90 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
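
The reworded comment above is the whole contract: an IOMMU_INIT_FINISH entry stops the detection scan once its detect routine returns a positive value, while a plain IOMMU_INIT entry lets the scan continue. Below is a self-contained sketch of those semantics with a toy table type and made-up entry names; it is not the kernel's iommu_table machinery.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct iommu_entry {
        const char *name;
        int (*detect)(void);
        bool finish;            /* true for IOMMU_INIT_FINISH, false for IOMMU_INIT */
};

static int detect_absent(void)  { return 0; }
static int detect_present(void) { return 1; }

static void scan_iommus(const struct iommu_entry *tbl, size_t n)
{
        for (size_t i = 0; i < n; i++) {
                int found = tbl[i].detect();

                printf("%-8s %s\n", tbl[i].name, found > 0 ? "found" : "not found");
                if (found > 0 && tbl[i].finish)
                        break;  /* _FINISH semantics: stop the chain here */
                /* plain IOMMU_INIT: keep probing the rest of the table */
        }
}

int main(void)
{
        const struct iommu_entry table[] = {
                { "gart",    detect_absent,  false },
                { "amd-vi",  detect_present, true  },
                { "swiotlb", detect_present, false },   /* never reached */
        };

        scan_iommus(table, sizeof(table) / sizeof(table[0]));
        return 0;
}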
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..b77f5edb03b0 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
136#define USERGS_SYSRET32 \ 136#define USERGS_SYSRET32 \
137 swapgs; \ 137 swapgs; \
138 sysretl 138 sysretl
139#define ENABLE_INTERRUPTS_SYSEXIT32 \
140 swapgs; \
141 sti; \
142 sysexit
143 139
144#else 140#else
145#define INTERRUPT_RETURN iret 141#define INTERRUPT_RETURN iret
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void)
163 159
164 return arch_irqs_disabled_flags(flags); 160 return arch_irqs_disabled_flags(flags);
165} 161}
162#endif /* !__ASSEMBLY__ */
166 163
164#ifdef __ASSEMBLY__
165#ifdef CONFIG_TRACE_IRQFLAGS
166# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
167# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
167#else 168#else
168 169# define TRACE_IRQS_ON
169#ifdef CONFIG_X86_64 170# define TRACE_IRQS_OFF
170#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk 171#endif
171#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ 172#ifdef CONFIG_DEBUG_LOCK_ALLOC
173# ifdef CONFIG_X86_64
174# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
175# define LOCKDEP_SYS_EXIT_IRQ \
172 TRACE_IRQS_ON; \ 176 TRACE_IRQS_ON; \
173 sti; \ 177 sti; \
174 SAVE_REST; \ 178 call lockdep_sys_exit_thunk; \
175 LOCKDEP_SYS_EXIT; \
176 RESTORE_REST; \
177 cli; \ 179 cli; \
178 TRACE_IRQS_OFF; 180 TRACE_IRQS_OFF;
179 181# else
180#else 182# define LOCKDEP_SYS_EXIT \
181#define ARCH_LOCKDEP_SYS_EXIT \
182 pushl %eax; \ 183 pushl %eax; \
183 pushl %ecx; \ 184 pushl %ecx; \
184 pushl %edx; \ 185 pushl %edx; \
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void)
186 popl %edx; \ 187 popl %edx; \
187 popl %ecx; \ 188 popl %ecx; \
188 popl %eax; 189 popl %eax;
189 190# define LOCKDEP_SYS_EXIT_IRQ
190#define ARCH_LOCKDEP_SYS_EXIT_IRQ 191# endif
191#endif
192
193#ifdef CONFIG_TRACE_IRQFLAGS
194# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
195# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
196#else 192#else
197# define TRACE_IRQS_ON
198# define TRACE_IRQS_OFF
199#endif
200#ifdef CONFIG_DEBUG_LOCK_ALLOC
201# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
202# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
203# else
204# define LOCKDEP_SYS_EXIT 193# define LOCKDEP_SYS_EXIT
205# define LOCKDEP_SYS_EXIT_IRQ 194# define LOCKDEP_SYS_EXIT_IRQ
206# endif 195#endif
207
208#endif /* __ASSEMBLY__ */ 196#endif /* __ASSEMBLY__ */
197
209#endif 198#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 6a2cefb4395a..a4c1cf7e93f8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_X86_JUMP_LABEL_H 1#ifndef _ASM_X86_JUMP_LABEL_H
2#define _ASM_X86_JUMP_LABEL_H 2#define _ASM_X86_JUMP_LABEL_H
3 3
4#ifdef __KERNEL__ 4#ifndef __ASSEMBLY__
5 5
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/types.h> 7#include <linux/types.h>
@@ -30,8 +30,6 @@ l_yes:
30 return true; 30 return true;
31} 31}
32 32
33#endif /* __KERNEL__ */
34
35#ifdef CONFIG_X86_64 33#ifdef CONFIG_X86_64
36typedef u64 jump_label_t; 34typedef u64 jump_label_t;
37#else 35#else
@@ -44,4 +42,5 @@ struct jump_entry {
44 jump_label_t key; 42 jump_label_t key;
45}; 43};
46 44
45#endif /* __ASSEMBLY__ */
47#endif 46#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39cc385..dea2e7e962e3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
81 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 81 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
82} 82}
83 83
84#define SELECTOR_TI_MASK (1 << 2)
85#define SELECTOR_RPL_MASK 0x03
86
87#define IOPL_SHIFT 12
88
89#define KVM_PERMILLE_MMU_PAGES 20 84#define KVM_PERMILLE_MMU_PAGES 20
90#define KVM_MIN_ALLOC_MMU_PAGES 64 85#define KVM_MIN_ALLOC_MMU_PAGES 64
91#define KVM_MMU_HASH_SHIFT 10 86#define KVM_MMU_HASH_SHIFT 10
@@ -345,6 +340,7 @@ struct kvm_pmu {
345enum { 340enum {
346 KVM_DEBUGREG_BP_ENABLED = 1, 341 KVM_DEBUGREG_BP_ENABLED = 1,
347 KVM_DEBUGREG_WONT_EXIT = 2, 342 KVM_DEBUGREG_WONT_EXIT = 2,
343 KVM_DEBUGREG_RELOAD = 4,
348}; 344};
349 345
350struct kvm_vcpu_arch { 346struct kvm_vcpu_arch {
@@ -431,6 +427,9 @@ struct kvm_vcpu_arch {
431 427
432 int cpuid_nent; 428 int cpuid_nent;
433 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 429 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
430
431 int maxphyaddr;
432
434 /* emulate context */ 433 /* emulate context */
435 434
436 struct x86_emulate_ctxt emulate_ctxt; 435 struct x86_emulate_ctxt emulate_ctxt;
@@ -550,11 +549,20 @@ struct kvm_arch_memory_slot {
550 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; 549 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
551}; 550};
552 551
552/*
553 * We use as the mode the number of bits allocated in the LDR for the
554 * logical processor ID. It happens that these are all powers of two.
555 * This makes it very easy to detect cases where the APICs are
556 * configured for multiple modes; in that case, we cannot use the map and
557 * hence cannot use kvm_irq_delivery_to_apic_fast either.
558 */
559#define KVM_APIC_MODE_XAPIC_CLUSTER 4
560#define KVM_APIC_MODE_XAPIC_FLAT 8
561#define KVM_APIC_MODE_X2APIC 16
562
553struct kvm_apic_map { 563struct kvm_apic_map {
554 struct rcu_head rcu; 564 struct rcu_head rcu;
555 u8 ldr_bits; 565 u8 mode;
556 /* fields bellow are used to decode ldr values in different modes */
557 u32 cid_shift, cid_mask, lid_mask, broadcast;
558 struct kvm_lapic *phys_map[256]; 566 struct kvm_lapic *phys_map[256];
559 /* first index is cluster id second is cpu id in a cluster */ 567 /* first index is cluster id second is cpu id in a cluster */
560 struct kvm_lapic *logical_map[16][16]; 568 struct kvm_lapic *logical_map[16][16];
@@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
859void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 867void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
860void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 868void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
861 struct kvm_memory_slot *memslot); 869 struct kvm_memory_slot *memslot);
870void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
871 struct kvm_memory_slot *memslot);
862void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 872void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
863 struct kvm_memory_slot *memslot); 873 struct kvm_memory_slot *memslot);
864void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 874void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -933,6 +943,7 @@ struct x86_emulate_ctxt;
933int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 943int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
934void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 944void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
935int kvm_emulate_halt(struct kvm_vcpu *vcpu); 945int kvm_emulate_halt(struct kvm_vcpu *vcpu);
946int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
936int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 947int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
937 948
938void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 949void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1128int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 1139int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
1129int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 1140int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
1130void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 1141void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
1131int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
1132int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); 1142int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1133int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1143int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1134int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1144int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
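
The new KVM_APIC_MODE_* values work because they are distinct powers of two: OR-ing the mode of every vCPU's APIC together yields a single-bit value only when all APICs agree, so a mixed configuration (where the fast delivery path must be avoided) is one power-of-two test away. A small userspace sketch of that property follows; map_is_usable() is a hypothetical helper, not a KVM function.

#include <stdbool.h>
#include <stdio.h>

#define KVM_APIC_MODE_XAPIC_CLUSTER     4
#define KVM_APIC_MODE_XAPIC_FLAT        8
#define KVM_APIC_MODE_X2APIC            16

static bool map_is_usable(unsigned int combined_mode)
{
        /* Exactly one bit set -> every APIC is configured the same way. */
        return combined_mode && !(combined_mode & (combined_mode - 1));
}

int main(void)
{
        unsigned int all_flat = KVM_APIC_MODE_XAPIC_FLAT;
        unsigned int mixed    = KVM_APIC_MODE_XAPIC_FLAT | KVM_APIC_MODE_X2APIC;

        printf("all flat: %d\n", map_is_usable(all_flat));      /* 1 */
        printf("mixed:    %d\n", map_is_usable(mixed));         /* 0 */
        return 0;
}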
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e62cf897f781..c1adf33fdd0d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void)
115 115
116static inline bool kvm_para_available(void) 116static inline bool kvm_para_available(void)
117{ 117{
118 return 0; 118 return false;
119} 119}
120 120
121static inline unsigned int kvm_arch_para_features(void) 121static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
32#endif 32#endif
33 return 0; 33 return 0;
34} 34}
35extern int klp_write_module_reloc(struct module *mod, unsigned long type, 35int klp_write_module_reloc(struct module *mod, unsigned long type,
36 unsigned long loc, unsigned long value); 36 unsigned long loc, unsigned long value);
37 37
38static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) 38static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
39{ 39{
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9b3de99dc004..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
116 u32 rip_msr; 116 u32 rip_msr;
117}; 117};
118 118
119struct mce_vendor_flags {
120 __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
121 __reserved_0 : 63;
122};
123extern struct mce_vendor_flags mce_flags;
124
119extern struct mca_config mca_cfg; 125extern struct mca_config mca_cfg;
120extern void mce_register_decode_chain(struct notifier_block *nb); 126extern void mce_register_decode_chain(struct notifier_block *nb);
121extern void mce_unregister_decode_chain(struct notifier_block *nb); 127extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
128#ifdef CONFIG_X86_MCE 134#ifdef CONFIG_X86_MCE
129int mcheck_init(void); 135int mcheck_init(void);
130void mcheck_cpu_init(struct cpuinfo_x86 *c); 136void mcheck_cpu_init(struct cpuinfo_x86 *c);
137void mcheck_vendor_init_severity(void);
131#else 138#else
132static inline int mcheck_init(void) { return 0; } 139static inline int mcheck_init(void) { return 0; }
133static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} 140static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
141static inline void mcheck_vendor_init_severity(void) {}
134#endif 142#endif
135 143
136#ifdef CONFIG_X86_ANCIENT_MCE 144#ifdef CONFIG_X86_ANCIENT_MCE
@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
183DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 191DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
184 192
185enum mcp_flags { 193enum mcp_flags {
186 MCP_TIMESTAMP = (1 << 0), /* log time stamp */ 194 MCP_TIMESTAMP = BIT(0), /* log time stamp */
187 MCP_UC = (1 << 1), /* log uncorrected errors */ 195 MCP_UC = BIT(1), /* log uncorrected errors */
188 MCP_DONTLOG = (1 << 2), /* only clear, don't log */ 196 MCP_DONTLOG = BIT(2), /* only clear, don't log */
189}; 197};
190void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 198bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
191 199
192int mce_notify_irq(void); 200int mce_notify_irq(void);
193 201
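
The hunk's comment ties the new overflow_recov vendor flag to cpuid_ebx(80000007). As a rough illustration only, the sketch below fills a look-alike bitfield from a hard-coded EBX value, treating bit 0 as the overflow-recovery capability; the struct name and the hard-coded value are stand-ins, not the kernel's mce_flags setup.

#include <stdio.h>

struct mce_vendor_flags_sketch {
        unsigned long long overflow_recov : 1,  /* mirrors the __u64 bitfield above */
                           __reserved_0   : 63;
};

int main(void)
{
        unsigned int ebx = 0x1;         /* pretend result of cpuid_ebx(0x80000007) */
        struct mce_vendor_flags_sketch f = { .overflow_recov = ebx & 1 };

        printf("MCA overflow recovery: %s\n",
               f.overflow_recov ? "supported" : "not supported");
        return 0;
}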
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 201b520521ed..2fb20d6f7e23 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {}
75 75
76#ifdef CONFIG_MICROCODE_EARLY 76#ifdef CONFIG_MICROCODE_EARLY
77#define MAX_UCODE_COUNT 128 77#define MAX_UCODE_COUNT 128
78
79#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
80#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
81#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
82#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
83#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
84#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
85#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
86
87#define CPUID_IS(a, b, c, ebx, ecx, edx) \
88 (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
89
90/*
91 * In the early microcode loading phase on the BSP, boot_cpu_data is not set up yet.
92 * x86_vendor() gets the vendor id for the BSP.
93 *
94 * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To simplify
95 * coding, we still use x86_vendor() to get the vendor id for the AP.
96 *
97 * x86_vendor() gets vendor information directly from CPUID.
98 */
99static inline int x86_vendor(void)
100{
101 u32 eax = 0x00000000;
102 u32 ebx, ecx = 0, edx;
103
104 native_cpuid(&eax, &ebx, &ecx, &edx);
105
106 if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
107 return X86_VENDOR_INTEL;
108
109 if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
110 return X86_VENDOR_AMD;
111
112 return X86_VENDOR_UNKNOWN;
113}
114
115static inline unsigned int __x86_family(unsigned int sig)
116{
117 unsigned int x86;
118
119 x86 = (sig >> 8) & 0xf;
120
121 if (x86 == 0xf)
122 x86 += (sig >> 20) & 0xff;
123
124 return x86;
125}
126
127static inline unsigned int x86_family(void)
128{
129 u32 eax = 0x00000001;
130 u32 ebx, ecx = 0, edx;
131
132 native_cpuid(&eax, &ebx, &ecx, &edx);
133
134 return __x86_family(eax);
135}
136
137static inline unsigned int x86_model(unsigned int sig)
138{
139 unsigned int x86, model;
140
141 x86 = __x86_family(sig);
142
143 model = (sig >> 4) & 0xf;
144
145 if (x86 == 0x6 || x86 == 0xf)
146 model += ((sig >> 16) & 0xf) << 4;
147
148 return model;
149}
150
78extern void __init load_ucode_bsp(void); 151extern void __init load_ucode_bsp(void);
79extern void load_ucode_ap(void); 152extern void load_ucode_ap(void);
80extern int __init save_microcode_in_initrd(void); 153extern int __init save_microcode_in_initrd(void);
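
The helpers added to <asm/microcode.h> above decode the family and model fields from a raw CPUID signature, applying the extended-family bits only for family 0xf and the extended-model bits for families 6 and 0xf. A standalone worked example on the signature 0x000306c3 (a Haswell part), reusing the same arithmetic under userspace names:

#include <stdio.h>

static unsigned int sig_family(unsigned int sig)
{
        unsigned int x86 = (sig >> 8) & 0xf;

        if (x86 == 0xf)                 /* extended family only when family == 0xf */
                x86 += (sig >> 20) & 0xff;
        return x86;
}

static unsigned int sig_model(unsigned int sig)
{
        unsigned int x86 = sig_family(sig);
        unsigned int model = (sig >> 4) & 0xf;

        if (x86 == 0x6 || x86 == 0xf)   /* extended model for families 6 and 15 */
                model += ((sig >> 16) & 0xf) << 4;
        return model;
}

int main(void)
{
        unsigned int sig = 0x000306c3;

        /* prints: family 0x6, model 0x3c */
        printf("family 0x%x, model 0x%x\n", sig_family(sig), sig_model(sig));
        return 0;
}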
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index dd4c20043ce7..2b9209c46ca9 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -56,12 +56,15 @@ struct extended_sigtable {
56 56
57#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 57#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
58 58
59extern int 59extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
60get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
61extern int microcode_sanity_check(void *mc, int print_err); 60extern int microcode_sanity_check(void *mc, int print_err);
62extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); 61extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
63extern int 62
64update_match_revision(struct microcode_header_intel *mc_header, int rev); 63static inline int
64revision_is_newer(struct microcode_header_intel *mc_header, int rev)
65{
66 return (mc_header->rev <= rev) ? 0 : 1;
67}
65 68
66#ifdef CONFIG_MICROCODE_INTEL_EARLY 69#ifdef CONFIG_MICROCODE_INTEL_EARLY
67extern void __init load_ucode_intel_bsp(void); 70extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a1410db38a1a..653dfa7662e1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
30 :: "a" (eax), "c" (ecx)); 30 :: "a" (eax), "c" (ecx));
31} 31}
32 32
33static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
34{
35 trace_hardirqs_on();
36 /* "mwait %eax, %ecx;" */
37 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
38 :: "a" (eax), "c" (ecx));
39}
40
33/* 41/*
34 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 42 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
35 * which can obviate IPI to trigger checking of need_resched. 43 * which can obviate IPI to trigger checking of need_resched.
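
__sti_mwait() above glues STI to MWAIT so interrupts are re-enabled only in the one-instruction STI shadow, closing the window between the last need_resched() check and entering the wait. Below is a rough kernel-context sketch of the assumed usage pattern, simplified: it presumes interrupts are already disabled, omits the TIF polling-flag bookkeeping real idle code does, and needs the usual <asm/mwait.h>/<linux/sched.h> includes.

/* Rough sketch only, not a drop-in idle routine. */
static void mwait_idle_sketch(void)
{
        if (!need_resched()) {
                /* Arm the monitor on the flags word a wakeup will touch. */
                __monitor(&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);      /* STI shadow: no lost wakeup */
                else
                        local_irq_enable();
        } else {
                local_irq_enable();
        }
}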
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
40 40
41#ifdef CONFIG_X86_64 41#ifdef CONFIG_X86_64
42#include <asm/page_64_types.h> 42#include <asm/page_64_types.h>
43#define IOREMAP_MAX_ORDER (PUD_SHIFT)
43#else 44#else
44#include <asm/page_32_types.h> 45#include <asm/page_32_types.h>
46#define IOREMAP_MAX_ORDER (PMD_SHIFT)
45#endif /* CONFIG_X86_64 */ 47#endif /* CONFIG_X86_64 */
46 48
47#ifndef __ASSEMBLY__ 49#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); 545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
546} 546}
547 547
548#if PAGETABLE_LEVELS >= 3 548#if CONFIG_PGTABLE_LEVELS >= 3
549static inline pmd_t __pmd(pmdval_t val) 549static inline pmd_t __pmd(pmdval_t val)
550{ 550{
551 pmdval_t ret; 551 pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, 585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
586 val); 586 val);
587} 587}
588#if PAGETABLE_LEVELS == 4 588#if CONFIG_PGTABLE_LEVELS == 4
589static inline pud_t __pud(pudval_t val) 589static inline pud_t __pud(pudval_t val)
590{ 590{
591 pudval_t ret; 591 pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
636 set_pud(pudp, __pud(0)); 636 set_pud(pudp, __pud(0));
637} 637}
638 638
639#endif /* PAGETABLE_LEVELS == 4 */ 639#endif /* CONFIG_PGTABLE_LEVELS == 4 */
640 640
641#endif /* PAGETABLE_LEVELS >= 3 */ 641#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
642 642
643#ifdef CONFIG_X86_PAE 643#ifdef CONFIG_X86_PAE
644/* Special-case pte-setting operations for PAE, which can't update a 644/* Special-case pte-setting operations for PAE, which can't update a
@@ -976,11 +976,6 @@ extern void default_banner(void);
976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ 976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
977 CLBR_NONE, \ 977 CLBR_NONE, \
978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) 978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
979
980#define ENABLE_INTERRUPTS_SYSEXIT32 \
981 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
982 CLBR_NONE, \
983 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
984#endif /* CONFIG_X86_32 */ 979#endif /* CONFIG_X86_32 */
985 980
986#endif /* __ASSEMBLY__ */ 981#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
294 struct paravirt_callee_save pgd_val; 294 struct paravirt_callee_save pgd_val;
295 struct paravirt_callee_save make_pgd; 295 struct paravirt_callee_save make_pgd;
296 296
297#if PAGETABLE_LEVELS >= 3 297#if CONFIG_PGTABLE_LEVELS >= 3
298#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
308 struct paravirt_callee_save pmd_val; 308 struct paravirt_callee_save pmd_val;
309 struct paravirt_callee_save make_pmd; 309 struct paravirt_callee_save make_pmd;
310 310
311#if PAGETABLE_LEVELS == 4 311#if CONFIG_PGTABLE_LEVELS == 4
312 struct paravirt_callee_save pud_val; 312 struct paravirt_callee_save pud_val;
313 struct paravirt_callee_save make_pud; 313 struct paravirt_callee_save make_pud;
314 314
315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
316#endif /* PAGETABLE_LEVELS == 4 */ 316#endif /* CONFIG_PGTABLE_LEVELS == 4 */
317#endif /* PAGETABLE_LEVELS >= 3 */ 317#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
318 318
319 struct pv_lazy_ops lazy_mode; 319 struct pv_lazy_ops lazy_mode;
320 320
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
77 77
78#define pmd_pgtable(pmd) pmd_page(pmd) 78#define pmd_pgtable(pmd) pmd_page(pmd)
79 79
80#if PAGETABLE_LEVELS > 2 80#if CONFIG_PGTABLE_LEVELS > 2
81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
82{ 82{
83 struct page *page; 83 struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
116} 116}
117#endif /* CONFIG_X86_PAE */ 117#endif /* CONFIG_X86_PAE */
118 118
119#if PAGETABLE_LEVELS > 3 119#if CONFIG_PGTABLE_LEVELS > 3
120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
121{ 121{
122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); 122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
142 ___pud_free_tlb(tlb, pud); 142 ___pud_free_tlb(tlb, pud);
143} 143}
144 144
145#endif /* PAGETABLE_LEVELS > 3 */ 145#endif /* CONFIG_PGTABLE_LEVELS > 3 */
146#endif /* PAGETABLE_LEVELS > 2 */ 146#endif /* CONFIG_PGTABLE_LEVELS > 2 */
147 147
148#endif /* _ASM_X86_PGALLOC_H */ 148#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
17#endif /* !__ASSEMBLY__ */ 17#endif /* !__ASSEMBLY__ */
18 18
19#define SHARED_KERNEL_PMD 0 19#define SHARED_KERNEL_PMD 0
20#define PAGETABLE_LEVELS 2
21 20
22/* 21/*
23 * traditional i386 two-level paging structure: 22 * traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
24#define SHARED_KERNEL_PMD 1 24#define SHARED_KERNEL_PMD 1
25#endif 25#endif
26 26
27#define PAGETABLE_LEVELS 3
28
29/* 27/*
30 * PGDIR_SHIFT determines what a top-level page table entry can map 28 * PGDIR_SHIFT determines what a top-level page table entry can map
31 */ 29 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
551 return npg >> (20 - PAGE_SHIFT); 551 return npg >> (20 - PAGE_SHIFT);
552} 552}
553 553
554#if PAGETABLE_LEVELS > 2 554#if CONFIG_PGTABLE_LEVELS > 2
555static inline int pud_none(pud_t pud) 555static inline int pud_none(pud_t pud)
556{ 556{
557 return native_pud_val(pud) == 0; 557 return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
594{ 594{
595 return 0; 595 return 0;
596} 596}
597#endif /* PAGETABLE_LEVELS > 2 */ 597#endif /* CONFIG_PGTABLE_LEVELS > 2 */
598 598
599#if PAGETABLE_LEVELS > 3 599#if CONFIG_PGTABLE_LEVELS > 3
600static inline int pgd_present(pgd_t pgd) 600static inline int pgd_present(pgd_t pgd)
601{ 601{
602 return pgd_flags(pgd) & _PAGE_PRESENT; 602 return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
633{ 633{
634 return !native_pgd_val(pgd); 634 return !native_pgd_val(pgd);
635} 635}
636#endif /* PAGETABLE_LEVELS > 3 */ 636#endif /* CONFIG_PGTABLE_LEVELS > 3 */
637 637
638#endif /* __ASSEMBLY__ */ 638#endif /* __ASSEMBLY__ */
639 639
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
20#endif /* !__ASSEMBLY__ */ 20#endif /* !__ASSEMBLY__ */
21 21
22#define SHARED_KERNEL_PMD 0 22#define SHARED_KERNEL_PMD 0
23#define PAGETABLE_LEVELS 4
24 23
25/* 24/*
26 * PGDIR_SHIFT determines what a top-level page table entry can map 25 * PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
234 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 234 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
235} 235}
236 236
237#if PAGETABLE_LEVELS > 3 237#if CONFIG_PGTABLE_LEVELS > 3
238typedef struct { pudval_t pud; } pud_t; 238typedef struct { pudval_t pud; } pud_t;
239 239
240static inline pud_t native_make_pud(pmdval_t val) 240static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
255} 255}
256#endif 256#endif
257 257
258#if PAGETABLE_LEVELS > 2 258#if CONFIG_PGTABLE_LEVELS > 2
259typedef struct { pmdval_t pmd; } pmd_t; 259typedef struct { pmdval_t pmd; } pmd_t;
260 260
261static inline pmd_t native_make_pmd(pmdval_t val) 261static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/pm-trace.h
index 3ff1c2cb1da5..7b7ac42c3661 100644
--- a/arch/x86/include/asm/resume-trace.h
+++ b/arch/x86/include/asm/pm-trace.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_RESUME_TRACE_H 1#ifndef _ASM_X86_PM_TRACE_H
2#define _ASM_X86_RESUME_TRACE_H 2#define _ASM_X86_PM_TRACE_H
3 3
4#include <asm/asm.h> 4#include <asm/asm.h>
5 5
@@ -14,8 +14,10 @@ do { \
14 ".previous" \ 14 ".previous" \
15 :"=r" (tracedata) \ 15 :"=r" (tracedata) \
16 : "i" (__LINE__), "i" (__FILE__)); \ 16 : "i" (__LINE__), "i" (__FILE__)); \
17 generate_resume_trace(tracedata, user); \ 17 generate_pm_trace(tracedata, user); \
18 } \ 18 } \
19} while (0) 19} while (0)
20 20
21#endif /* _ASM_X86_RESUME_TRACE_H */ 21#define TRACE_SUSPEND(user) TRACE_RESUME(user)
22
23#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a12d50e04d7a..23ba6765b718 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -213,8 +213,23 @@ struct x86_hw_tss {
213 unsigned long sp0; 213 unsigned long sp0;
214 unsigned short ss0, __ss0h; 214 unsigned short ss0, __ss0h;
215 unsigned long sp1; 215 unsigned long sp1;
216 /* ss1 caches MSR_IA32_SYSENTER_CS: */ 216
217 unsigned short ss1, __ss1h; 217 /*
218 * We don't use ring 1, so ss1 is a convenient scratch space in
219 * the same cacheline as sp0. We use ss1 to cache the value in
220 * MSR_IA32_SYSENTER_CS. When we context switch
221 * MSR_IA32_SYSENTER_CS, we first check if the new value being
222 * written matches ss1, and, if it's not, then we wrmsr the new
223 * value and update ss1.
224 *
225 * The only reason we context switch MSR_IA32_SYSENTER_CS is
226 * that we set it to zero in vm86 tasks to avoid corrupting the
227 * stack if we were to go through the sysenter path from vm86
228 * mode.
229 */
230 unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
231
232 unsigned short __ss1h;
218 unsigned long sp2; 233 unsigned long sp2;
219 unsigned short ss2, __ss2h; 234 unsigned short ss2, __ss2h;
220 unsigned long __cr3; 235 unsigned long __cr3;
@@ -279,13 +294,17 @@ struct tss_struct {
279 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 294 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
280 295
281 /* 296 /*
282 * .. and then another 0x100 bytes for the emergency kernel stack: 297 * Space for the temporary SYSENTER stack:
283 */ 298 */
284 unsigned long stack[64]; 299 unsigned long SYSENTER_stack[64];
285 300
286} ____cacheline_aligned; 301} ____cacheline_aligned;
287 302
288DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); 303DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
304
305#ifdef CONFIG_X86_32
306DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
307#endif
289 308
290/* 309/*
291 * Save the original ist values for checking stack pointers during debugging 310 * Save the original ist values for checking stack pointers during debugging
@@ -477,7 +496,6 @@ struct thread_struct {
477#ifdef CONFIG_X86_32 496#ifdef CONFIG_X86_32
478 unsigned long sysenter_cs; 497 unsigned long sysenter_cs;
479#else 498#else
480 unsigned long usersp; /* Copy from PDA */
481 unsigned short es; 499 unsigned short es;
482 unsigned short ds; 500 unsigned short ds;
483 unsigned short fsindex; 501 unsigned short fsindex;
@@ -567,6 +585,16 @@ static inline void native_swapgs(void)
567#endif 585#endif
568} 586}
569 587
588static inline unsigned long current_top_of_stack(void)
589{
590#ifdef CONFIG_X86_64
591 return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
592#else
593 /* sp0 on x86_32 is special in and around vm86 mode. */
594 return this_cpu_read_stable(cpu_current_top_of_stack);
595#endif
596}
597
570#ifdef CONFIG_PARAVIRT 598#ifdef CONFIG_PARAVIRT
571#include <asm/paravirt.h> 599#include <asm/paravirt.h>
572#else 600#else
@@ -764,10 +792,10 @@ extern char ignore_fpu_irq;
764#define ARCH_HAS_SPINLOCK_PREFETCH 792#define ARCH_HAS_SPINLOCK_PREFETCH
765 793
766#ifdef CONFIG_X86_32 794#ifdef CONFIG_X86_32
767# define BASE_PREFETCH ASM_NOP4 795# define BASE_PREFETCH ""
768# define ARCH_HAS_PREFETCH 796# define ARCH_HAS_PREFETCH
769#else 797#else
770# define BASE_PREFETCH "prefetcht0 (%1)" 798# define BASE_PREFETCH "prefetcht0 %P1"
771#endif 799#endif
772 800
773/* 801/*
@@ -778,10 +806,9 @@ extern char ignore_fpu_irq;
778 */ 806 */
779static inline void prefetch(const void *x) 807static inline void prefetch(const void *x)
780{ 808{
781 alternative_input(BASE_PREFETCH, 809 alternative_input(BASE_PREFETCH, "prefetchnta %P1",
782 "prefetchnta (%1)",
783 X86_FEATURE_XMM, 810 X86_FEATURE_XMM,
784 "r" (x)); 811 "m" (*(const char *)x));
785} 812}
786 813
787/* 814/*
@@ -791,10 +818,9 @@ static inline void prefetch(const void *x)
791 */ 818 */
792static inline void prefetchw(const void *x) 819static inline void prefetchw(const void *x)
793{ 820{
794 alternative_input(BASE_PREFETCH, 821 alternative_input(BASE_PREFETCH, "prefetchw %P1",
795 "prefetchw (%1)", 822 X86_FEATURE_3DNOWPREFETCH,
796 X86_FEATURE_3DNOW, 823 "m" (*(const char *)x));
797 "r" (x));
798} 824}
799 825
800static inline void spin_lock_prefetch(const void *x) 826static inline void spin_lock_prefetch(const void *x)
@@ -802,6 +828,9 @@ static inline void spin_lock_prefetch(const void *x)
802 prefetchw(x); 828 prefetchw(x);
803} 829}
804 830
831#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
832 TOP_OF_KERNEL_STACK_PADDING)
833
805#ifdef CONFIG_X86_32 834#ifdef CONFIG_X86_32
806/* 835/*
807 * User space process size: 3GB (default). 836 * User space process size: 3GB (default).
@@ -812,39 +841,16 @@ static inline void spin_lock_prefetch(const void *x)
812#define STACK_TOP_MAX STACK_TOP 841#define STACK_TOP_MAX STACK_TOP
813 842
814#define INIT_THREAD { \ 843#define INIT_THREAD { \
815 .sp0 = sizeof(init_stack) + (long)&init_stack, \ 844 .sp0 = TOP_OF_INIT_STACK, \
816 .vm86_info = NULL, \ 845 .vm86_info = NULL, \
817 .sysenter_cs = __KERNEL_CS, \ 846 .sysenter_cs = __KERNEL_CS, \
818 .io_bitmap_ptr = NULL, \ 847 .io_bitmap_ptr = NULL, \
819} 848}
820 849
821/*
822 * Note that the .io_bitmap member must be extra-big. This is because
823 * the CPU will access an additional byte beyond the end of the IO
824 * permission bitmap. The extra byte must be all 1 bits, and must
825 * be within the limit.
826 */
827#define INIT_TSS { \
828 .x86_tss = { \
829 .sp0 = sizeof(init_stack) + (long)&init_stack, \
830 .ss0 = __KERNEL_DS, \
831 .ss1 = __KERNEL_CS, \
832 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
833 }, \
834 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
835}
836
837extern unsigned long thread_saved_pc(struct task_struct *tsk); 850extern unsigned long thread_saved_pc(struct task_struct *tsk);
838 851
839#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
840#define KSTK_TOP(info) \
841({ \
842 unsigned long *__ptr = (unsigned long *)(info); \
843 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
844})
845
846/* 852/*
847 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 853 * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
848 * This is necessary to guarantee that the entire "struct pt_regs" 854 * This is necessary to guarantee that the entire "struct pt_regs"
849 * is accessible even if the CPU haven't stored the SS/ESP registers 855 * is accessible even if the CPU haven't stored the SS/ESP registers
850 * on the stack (interrupt gate does not save these registers 856 * on the stack (interrupt gate does not save these registers
@@ -853,11 +859,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
853 * "struct pt_regs" is possible, but they may contain the 859 * "struct pt_regs" is possible, but they may contain the
854 * completely wrong values. 860 * completely wrong values.
855 */ 861 */
856#define task_pt_regs(task) \ 862#define task_pt_regs(task) \
857({ \ 863({ \
858 struct pt_regs *__regs__; \ 864 unsigned long __ptr = (unsigned long)task_stack_page(task); \
859 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ 865 __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
860 __regs__ - 1; \ 866 ((struct pt_regs *)__ptr) - 1; \
861}) 867})
862 868
863#define KSTK_ESP(task) (task_pt_regs(task)->sp) 869#define KSTK_ESP(task) (task_pt_regs(task)->sp)
@@ -889,11 +895,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
889#define STACK_TOP_MAX TASK_SIZE_MAX 895#define STACK_TOP_MAX TASK_SIZE_MAX
890 896
891#define INIT_THREAD { \ 897#define INIT_THREAD { \
892 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ 898 .sp0 = TOP_OF_INIT_STACK \
893}
894
895#define INIT_TSS { \
896 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
897} 899}
898 900
899/* 901/*
@@ -905,11 +907,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
905#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) 907#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
906extern unsigned long KSTK_ESP(struct task_struct *task); 908extern unsigned long KSTK_ESP(struct task_struct *task);
907 909
908/*
909 * User space RSP while inside the SYSCALL fast path
910 */
911DECLARE_PER_CPU(unsigned long, old_rsp);
912
913#endif /* CONFIG_X86_64 */ 910#endif /* CONFIG_X86_64 */
914 911
915extern void start_thread(struct pt_regs *regs, unsigned long new_ip, 912extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
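
The rewritten 32-bit task_pt_regs() above locates pt_regs at THREAD_SIZE minus TOP_OF_KERNEL_STACK_PADDING from the base of the stack page, replacing the removed KSTK_TOP()/-8 arithmetic. A standalone sketch of that computation follows; THREAD_SIZE and the pt_regs layout here are illustrative stand-ins, while the 8-byte padding matches the comment in the hunk.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE                     8192    /* stand-in for the 32-bit value */
#define TOP_OF_KERNEL_STACK_PADDING     8       /* per the comment in the hunk */

struct fake_pt_regs { unsigned long bx, cx, dx, si, di, bp, ax; /* ... */ };

static struct fake_pt_regs *task_pt_regs_sketch(void *stack_page)
{
        uintptr_t ptr = (uintptr_t)stack_page;

        ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        return (struct fake_pt_regs *)ptr - 1;  /* regs sit just below the padding */
}

int main(void)
{
        void *stack = malloc(THREAD_SIZE);
        struct fake_pt_regs *regs = task_pt_regs_sketch(stack);

        printf("stack page %p, pt_regs %p\n", stack, (void *)regs);
        free(stack);
        return 0;
}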
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..19507ffa5d28 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
31#else /* __i386__ */ 31#else /* __i386__ */
32 32
33struct pt_regs { 33struct pt_regs {
34/*
35 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
36 * unless syscall needs a complete, fully filled "struct pt_regs".
37 */
34 unsigned long r15; 38 unsigned long r15;
35 unsigned long r14; 39 unsigned long r14;
36 unsigned long r13; 40 unsigned long r13;
37 unsigned long r12; 41 unsigned long r12;
38 unsigned long bp; 42 unsigned long bp;
39 unsigned long bx; 43 unsigned long bx;
40/* arguments: non interrupts/non tracing syscalls only save up to here*/ 44/* These regs are callee-clobbered. Always saved on kernel entry. */
41 unsigned long r11; 45 unsigned long r11;
42 unsigned long r10; 46 unsigned long r10;
43 unsigned long r9; 47 unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
47 unsigned long dx; 51 unsigned long dx;
48 unsigned long si; 52 unsigned long si;
49 unsigned long di; 53 unsigned long di;
54/*
55 * On syscall entry, this is syscall#. On CPU exception, this is error code.
56 * On hw interrupt, it's IRQ number:
57 */
50 unsigned long orig_ax; 58 unsigned long orig_ax;
51/* end of arguments */ 59/* Return frame for iretq */
52/* cpu exception frame or undefined */
53 unsigned long ip; 60 unsigned long ip;
54 unsigned long cs; 61 unsigned long cs;
55 unsigned long flags; 62 unsigned long flags;
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
89} 96}
90 97
91/* 98/*
92 * user_mode_vm(regs) determines whether a register set came from user mode. 99 * user_mode(regs) determines whether a register set came from user
93 * This is true if V8086 mode was enabled OR if the register set was from 100 * mode. On x86_32, this is true if V8086 mode was enabled OR if the
94 * protected mode with RPL-3 CS value. This tricky test checks that with 101 * register set was from protected mode with RPL-3 CS value. This
95 * one comparison. Many places in the kernel can bypass this full check 102 * tricky test checks that with one comparison.
96 * if they have already ruled out V8086 mode, so user_mode(regs) can be used. 103 *
104 * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
105 * the extra check.
97 */ 106 */
98static inline int user_mode(struct pt_regs *regs) 107static inline int user_mode(struct pt_regs *regs)
99{ 108{
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
104#endif 113#endif
105} 114}
106 115
107static inline int user_mode_vm(struct pt_regs *regs)
108{
109#ifdef CONFIG_X86_32
110 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
111 USER_RPL;
112#else
113 return user_mode(regs);
114#endif
115}
116
117static inline int v8086_mode(struct pt_regs *regs) 116static inline int v8086_mode(struct pt_regs *regs)
118{ 117{
119#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
138#endif 137#endif
139} 138}
140 139
141#define current_user_stack_pointer() this_cpu_read(old_rsp) 140#define current_user_stack_pointer() current_pt_regs()->sp
142/* ia32 vs. x32 difference */ 141#define compat_user_stack_pointer() current_pt_regs()->sp
143#define compat_user_stack_pointer() \
144 (test_thread_flag(TIF_IA32) \
145 ? current_pt_regs()->sp \
146 : this_cpu_read(old_rsp))
147#endif 142#endif
148 143
149#ifdef CONFIG_X86_32 144#ifdef CONFIG_X86_32
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
248 */ 243 */
249#define arch_ptrace_stop_needed(code, info) \ 244#define arch_ptrace_stop_needed(code, info) \
250({ \ 245({ \
251 set_thread_flag(TIF_NOTIFY_RESUME); \ 246 force_iret(); \
252 false; \ 247 false; \
253}) 248})
254 249
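
With user_mode_vm() gone, user_mode() carries the whole check: on 32-bit, one comparison covers both an RPL-3 %cs and VM86 mode, because the VM flag bit sits far above USER_RPL. A userspace sketch of that test follows; the constants mirror the usual selector/EFLAGS values and the struct is a stand-in for pt_regs.

#include <stdbool.h>
#include <stdio.h>

#define SEGMENT_RPL_MASK        0x3
#define USER_RPL                0x3
#define X86_VM_MASK             0x00020000      /* X86_EFLAGS_VM */

struct regs32 { unsigned long cs, flags; };     /* stand-in for pt_regs */

static bool user_mode32(const struct regs32 *regs)
{
        return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
               USER_RPL;
}

int main(void)
{
        struct regs32 kernel = { .cs = 0x10, .flags = 0x00000200 };     /* ring 0 */
        struct regs32 user   = { .cs = 0x73, .flags = 0x00000200 };     /* ring 3 */
        struct regs32 vm86   = { .cs = 0x00, .flags = 0x00020200 };     /* VM set */

        printf("kernel=%d user=%d vm86=%d\n",
               user_mode32(&kernel), user_mode32(&user), user_mode32(&vm86));
        return 0;       /* prints kernel=0 user=1 vm86=1 */
}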
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..25b1cc07d496 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
95 95
96struct pvclock_vsyscall_time_info { 96struct pvclock_vsyscall_time_info {
97 struct pvclock_vcpu_time_info pvti; 97 struct pvclock_vcpu_time_info pvti;
98 u32 migrate_count;
98} __attribute__((__aligned__(SMP_CACHE_BYTES))); 99} __attribute__((__aligned__(SMP_CACHE_BYTES)));
99 100
100#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 101#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
1#ifndef _ASM_X86_SECCOMP_H
2#define _ASM_X86_SECCOMP_H
3
4#include <asm/unistd.h>
5
1#ifdef CONFIG_X86_32 6#ifdef CONFIG_X86_32
2# include <asm/seccomp_32.h> 7#define __NR_seccomp_sigreturn __NR_sigreturn
3#else
4# include <asm/seccomp_64.h>
5#endif 8#endif
9
10#ifdef CONFIG_COMPAT
11#include <asm/ia32_unistd.h>
12#define __NR_seccomp_read_32 __NR_ia32_read
13#define __NR_seccomp_write_32 __NR_ia32_write
14#define __NR_seccomp_exit_32 __NR_ia32_exit
15#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
16#endif
17
18#include <asm-generic/seccomp.h>
19
20#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
1#ifndef _ASM_X86_SECCOMP_32_H
2#define _ASM_X86_SECCOMP_32_H
3
4#include <linux/unistd.h>
5
6#define __NR_seccomp_read __NR_read
7#define __NR_seccomp_write __NR_write
8#define __NR_seccomp_exit __NR_exit
9#define __NR_seccomp_sigreturn __NR_sigreturn
10
11#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef _ASM_X86_SECCOMP_64_H
2#define _ASM_X86_SECCOMP_64_H
3
4#include <linux/unistd.h>
5#include <asm/ia32_unistd.h>
6
7#define __NR_seccomp_read __NR_read
8#define __NR_seccomp_write __NR_write
9#define __NR_seccomp_exit __NR_exit
10#define __NR_seccomp_sigreturn __NR_rt_sigreturn
11
12#define __NR_seccomp_read_32 __NR_ia32_read
13#define __NR_seccomp_write_32 __NR_ia32_write
14#define __NR_seccomp_exit_32 __NR_ia32_exit
15#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
16
17#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a58571f..5a9856eb12ba 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5 5
6/* Constructor for a conventional segment GDT (or LDT) entry */ 6/*
7/* This is a macro so it can be used in initializers */ 7 * Constructor for a conventional segment GDT (or LDT) entry.
8 * This is a macro so it can be used in initializers.
9 */
8#define GDT_ENTRY(flags, base, limit) \ 10#define GDT_ENTRY(flags, base, limit) \
9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ 11 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ 12 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
@@ -12,198 +14,228 @@
12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \ 14 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
13 (((limit) & _AC(0x0000ffff,ULL)))) 15 (((limit) & _AC(0x0000ffff,ULL))))
14 16
15/* Simple and small GDT entries for booting only */ 17/* Simple and small GDT entries for booting only: */
16 18
17#define GDT_ENTRY_BOOT_CS 2 19#define GDT_ENTRY_BOOT_CS 2
18#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) 20#define GDT_ENTRY_BOOT_DS 3
21#define GDT_ENTRY_BOOT_TSS 4
22#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
23#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
25
26/*
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_RPL_MASK 0x3
19 31
20#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) 32/* User mode is privilege level 3: */
21#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) 33#define USER_RPL 0x3
22 34
23#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) 35/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) 36#define SEGMENT_TI_MASK 0x4
37/* LDT segment has TI set ... */
38#define SEGMENT_LDT 0x4
39/* ... GDT has it cleared */
40#define SEGMENT_GDT 0x0
25 41
26#define SEGMENT_RPL_MASK 0x3 /* 42#define GDT_ENTRY_INVALID_SEG 0
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */
31#define USER_RPL 0x3 /* User mode is privilege level 3 */
32#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */
33#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */
34 43
35#ifdef CONFIG_X86_32 44#ifdef CONFIG_X86_32
36/* 45/*
37 * The layout of the per-CPU GDT under Linux: 46 * The layout of the per-CPU GDT under Linux:
38 * 47 *
39 * 0 - null 48 * 0 - null <=== cacheline #1
40 * 1 - reserved 49 * 1 - reserved
41 * 2 - reserved 50 * 2 - reserved
42 * 3 - reserved 51 * 3 - reserved
43 * 52 *
44 * 4 - unused <==== new cacheline 53 * 4 - unused <=== cacheline #2
45 * 5 - unused 54 * 5 - unused
46 * 55 *
47 * ------- start of TLS (Thread-Local Storage) segments: 56 * ------- start of TLS (Thread-Local Storage) segments:
48 * 57 *
49 * 6 - TLS segment #1 [ glibc's TLS segment ] 58 * 6 - TLS segment #1 [ glibc's TLS segment ]
50 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] 59 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
51 * 8 - TLS segment #3 60 * 8 - TLS segment #3 <=== cacheline #3
52 * 9 - reserved 61 * 9 - reserved
53 * 10 - reserved 62 * 10 - reserved
54 * 11 - reserved 63 * 11 - reserved
55 * 64 *
56 * ------- start of kernel segments: 65 * ------- start of kernel segments:
57 * 66 *
58 * 12 - kernel code segment <==== new cacheline 67 * 12 - kernel code segment <=== cacheline #4
59 * 13 - kernel data segment 68 * 13 - kernel data segment
60 * 14 - default user CS 69 * 14 - default user CS
61 * 15 - default user DS 70 * 15 - default user DS
62 * 16 - TSS 71 * 16 - TSS <=== cacheline #5
63 * 17 - LDT 72 * 17 - LDT
64 * 18 - PNPBIOS support (16->32 gate) 73 * 18 - PNPBIOS support (16->32 gate)
65 * 19 - PNPBIOS support 74 * 19 - PNPBIOS support
66 * 20 - PNPBIOS support 75 * 20 - PNPBIOS support <=== cacheline #6
67 * 21 - PNPBIOS support 76 * 21 - PNPBIOS support
68 * 22 - PNPBIOS support 77 * 22 - PNPBIOS support
69 * 23 - APM BIOS support 78 * 23 - APM BIOS support
70 * 24 - APM BIOS support 79 * 24 - APM BIOS support <=== cacheline #7
71 * 25 - APM BIOS support 80 * 25 - APM BIOS support
72 * 81 *
73 * 26 - ESPFIX small SS 82 * 26 - ESPFIX small SS
74 * 27 - per-cpu [ offset to per-cpu data area ] 83 * 27 - per-cpu [ offset to per-cpu data area ]
75 * 28 - stack_canary-20 [ for stack protector ] 84 * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
76 * 29 - unused 85 * 29 - unused
77 * 30 - unused 86 * 30 - unused
78 * 31 - TSS for double fault handler 87 * 31 - TSS for double fault handler
79 */ 88 */
80#define GDT_ENTRY_TLS_MIN 6 89#define GDT_ENTRY_TLS_MIN 6
81#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) 90#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
82 91
92#define GDT_ENTRY_KERNEL_CS 12
93#define GDT_ENTRY_KERNEL_DS 13
83#define GDT_ENTRY_DEFAULT_USER_CS 14 94#define GDT_ENTRY_DEFAULT_USER_CS 14
84
85#define GDT_ENTRY_DEFAULT_USER_DS 15 95#define GDT_ENTRY_DEFAULT_USER_DS 15
96#define GDT_ENTRY_TSS 16
97#define GDT_ENTRY_LDT 17
98#define GDT_ENTRY_PNPBIOS_CS32 18
99#define GDT_ENTRY_PNPBIOS_CS16 19
100#define GDT_ENTRY_PNPBIOS_DS 20
101#define GDT_ENTRY_PNPBIOS_TS1 21
102#define GDT_ENTRY_PNPBIOS_TS2 22
103#define GDT_ENTRY_APMBIOS_BASE 23
104
105#define GDT_ENTRY_ESPFIX_SS 26
106#define GDT_ENTRY_PERCPU 27
107#define GDT_ENTRY_STACK_CANARY 28
108
109#define GDT_ENTRY_DOUBLEFAULT_TSS 31
86 110
87#define GDT_ENTRY_KERNEL_BASE (12) 111/*
112 * Number of entries in the GDT table:
113 */
114#define GDT_ENTRIES 32
88 115
89#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) 116/*
117 * Segment selector values corresponding to the above entries:
118 */
90 119
91#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) 120#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
121#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
122#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
123#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
124#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
92 125
93#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) 126/* segment for calling fn: */
94#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) 127#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
128/* code segment for BIOS: */
129#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
95 130
96#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) 131/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
97#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) 132#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
98 133
99#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) 134/* data segment for BIOS: */
100#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) 135#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
136/* transfer data segment: */
137#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
138/* another data segment: */
139#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
101 140
102#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
103#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
104#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) 142# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
105#else 143#else
106#define __KERNEL_PERCPU 0 144# define __KERNEL_PERCPU 0
107#endif 145#endif
108 146
109#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
110#ifdef CONFIG_CC_STACKPROTECTOR 147#ifdef CONFIG_CC_STACKPROTECTOR
111#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) 148# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
112#else 149#else
113#define __KERNEL_STACK_CANARY 0 150# define __KERNEL_STACK_CANARY 0
114#endif 151#endif
115 152
116#define GDT_ENTRY_DOUBLEFAULT_TSS 31 153#else /* 64-bit: */
117
118/*
119 * The GDT has 32 entries
120 */
121#define GDT_ENTRIES 32
122 154
123/* The PnP BIOS entries in the GDT */ 155#include <asm/cache.h>
124#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
125#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
126#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
127#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
128#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
129
130/* The PnP BIOS selectors */
131#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
132#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
133#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
134#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
135#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
136 156
157#define GDT_ENTRY_KERNEL32_CS 1
158#define GDT_ENTRY_KERNEL_CS 2
159#define GDT_ENTRY_KERNEL_DS 3
137 160
138/* 161/*
139 * Matching rules for certain types of segments. 162 * We cannot use the same code segment descriptor for user and kernel mode,
163 * not even in long flat mode, because of different DPL.
164 *
165 * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
166 * selectors:
167 *
168 * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
169 * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
170 *
171 * ss = STAR.SYSRET_CS+8 (in either case)
172 *
173 * thus USER_DS should be between 32-bit and 64-bit code selectors:
140 */ 174 */
175#define GDT_ENTRY_DEFAULT_USER32_CS 4
176#define GDT_ENTRY_DEFAULT_USER_DS 5
177#define GDT_ENTRY_DEFAULT_USER_CS 6
141 178
142/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ 179/* Needs two entries */
143#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) 180#define GDT_ENTRY_TSS 8
144 181/* Needs two entries */
182#define GDT_ENTRY_LDT 10
145 183
146#else 184#define GDT_ENTRY_TLS_MIN 12
147#include <asm/cache.h> 185#define GDT_ENTRY_TLS_MAX 14
148
149#define GDT_ENTRY_KERNEL32_CS 1
150#define GDT_ENTRY_KERNEL_CS 2
151#define GDT_ENTRY_KERNEL_DS 3
152 186
153#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) 187/* Abused to load per CPU data from limit */
188#define GDT_ENTRY_PER_CPU 15
154 189
155/* 190/*
156 * we cannot use the same code segment descriptor for user and kernel 191 * Number of entries in the GDT table:
157 * -- not even in the long flat mode, because of different DPL /kkeil
158 * The segment offset needs to contain a RPL. Grr. -AK
159 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
160 */ 192 */
161#define GDT_ENTRY_DEFAULT_USER32_CS 4 193#define GDT_ENTRIES 16
162#define GDT_ENTRY_DEFAULT_USER_DS 5
163#define GDT_ENTRY_DEFAULT_USER_CS 6
164#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
165#define __USER32_DS __USER_DS
166
167#define GDT_ENTRY_TSS 8 /* needs two entries */
168#define GDT_ENTRY_LDT 10 /* needs two entries */
169#define GDT_ENTRY_TLS_MIN 12
170#define GDT_ENTRY_TLS_MAX 14
171
172#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
173#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
174 194
175/* TLS indexes for 64bit - hardcoded in arch_prctl */ 195/*
176#define FS_TLS 0 196 * Segment selector values corresponding to the above entries:
177#define GS_TLS 1 197 *
178 198 * Note, selectors also need to have a correct RPL,
179#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) 199 * expressed with the +3 value for user-space selectors:
180#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) 200 */
181 201#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
182#define GDT_ENTRIES 16 202#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
203#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
204#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
205#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
206#define __USER32_DS __USER_DS
207#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
208#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
209
210/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
211#define FS_TLS 0
212#define GS_TLS 1
213
214#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
215#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
183 216
184#endif 217#endif
185 218
186#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
187#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
188#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
189#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
190#ifndef CONFIG_PARAVIRT 219#ifndef CONFIG_PARAVIRT
191#define get_kernel_rpl() 0 220# define get_kernel_rpl() 0
192#endif 221#endif
193 222
194#define IDT_ENTRIES 256 223#define IDT_ENTRIES 256
195#define NUM_EXCEPTION_VECTORS 32 224#define NUM_EXCEPTION_VECTORS 32
196/* Bitmask of exception vectors which push an error code on the stack */ 225
197#define EXCEPTION_ERRCODE_MASK 0x00027d00 226/* Bitmask of exception vectors which push an error code on the stack: */
198#define GDT_SIZE (GDT_ENTRIES * 8) 227#define EXCEPTION_ERRCODE_MASK 0x00027d00
199#define GDT_ENTRY_TLS_ENTRIES 3 228
200#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 229#define GDT_SIZE (GDT_ENTRIES*8)
230#define GDT_ENTRY_TLS_ENTRIES 3
231#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
201 232
202#ifdef __KERNEL__ 233#ifdef __KERNEL__
203#ifndef __ASSEMBLY__ 234#ifndef __ASSEMBLY__
235
204extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; 236extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
205#ifdef CONFIG_TRACING 237#ifdef CONFIG_TRACING
206#define trace_early_idt_handlers early_idt_handlers 238# define trace_early_idt_handlers early_idt_handlers
207#endif 239#endif
208 240
209/* 241/*
@@ -228,37 +260,30 @@ do { \
228} while (0) 260} while (0)
229 261
230/* 262/*
231 * Save a segment register away 263 * Save a segment register away:
232 */ 264 */
233#define savesegment(seg, value) \ 265#define savesegment(seg, value) \
234 asm("mov %%" #seg ",%0":"=r" (value) : : "memory") 266 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
235 267
236/* 268/*
237 * x86_32 user gs accessors. 269 * x86-32 user GS accessors:
238 */ 270 */
239#ifdef CONFIG_X86_32 271#ifdef CONFIG_X86_32
240#ifdef CONFIG_X86_32_LAZY_GS 272# ifdef CONFIG_X86_32_LAZY_GS
241#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) 273# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
242#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) 274# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
243#define task_user_gs(tsk) ((tsk)->thread.gs) 275# define task_user_gs(tsk) ((tsk)->thread.gs)
244#define lazy_save_gs(v) savesegment(gs, (v)) 276# define lazy_save_gs(v) savesegment(gs, (v))
245#define lazy_load_gs(v) loadsegment(gs, (v)) 277# define lazy_load_gs(v) loadsegment(gs, (v))
246#else /* X86_32_LAZY_GS */ 278# else /* X86_32_LAZY_GS */
247#define get_user_gs(regs) (u16)((regs)->gs) 279# define get_user_gs(regs) (u16)((regs)->gs)
248#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) 280# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
249#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) 281# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
250#define lazy_save_gs(v) do { } while (0) 282# define lazy_save_gs(v) do { } while (0)
251#define lazy_load_gs(v) do { } while (0) 283# define lazy_load_gs(v) do { } while (0)
252#endif /* X86_32_LAZY_GS */ 284# endif /* X86_32_LAZY_GS */
253#endif /* X86_32 */ 285#endif /* X86_32 */
254 286
255static inline unsigned long get_limit(unsigned long segment)
256{
257 unsigned long __limit;
258 asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
259 return __limit + 1;
260}
261
262#endif /* !__ASSEMBLY__ */ 287#endif /* !__ASSEMBLY__ */
263#endif /* __KERNEL__ */ 288#endif /* __KERNEL__ */
264 289
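The reordered 64-bit block above exists because SYSRET derives its selectors from fixed offsets off STAR.SYSRET_CS (CS for a 32-bit return, SS at +8, CS for a 64-bit return at +16), which is why __USER_DS has to sit between the two user code selectors. A minimal user-space sketch of that arithmetic, reusing the GDT indices from the header (the RPL-3 OR is what the CPU does on SYSRET; this is an illustration, not kernel code):

#include <stdio.h>

/* GDT indices copied from the 64-bit block above; descriptors are 8 bytes. */
#define GDT_ENTRY_DEFAULT_USER32_CS	4
#define GDT_ENTRY_DEFAULT_USER_DS	5
#define GDT_ENTRY_DEFAULT_USER_CS	6

int main(void)
{
	unsigned int sysret_base = GDT_ENTRY_DEFAULT_USER32_CS * 8; /* programmed as STAR.SYSRET_CS */

	/* Selectors the CPU loads on SYSRET (RPL 3 forced in the low bits): */
	unsigned int cs32 = (sysret_base +  0) | 3;	/* return to 32-bit user mode */
	unsigned int ss   = (sysret_base +  8) | 3;	/* SS in either case          */
	unsigned int cs64 = (sysret_base + 16) | 3;	/* return to 64-bit user mode */

	/* Prints 0x23 0x2b 0x33 -- i.e. __USER32_CS, __USER_DS, __USER_CS,
	 * so the data selector must be the entry between the two code selectors. */
	printf("cs32=%#x ss=%#x cs64=%#x\n", cs32, ss, cs64);
	return 0;
}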
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ff4e7b236e21..f69e06b283fb 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
66 */ 66 */
67extern struct boot_params boot_params; 67extern struct boot_params boot_params;
68 68
69static inline bool kaslr_enabled(void)
70{
71 return !!(boot_params.hdr.loadflags & KASLR_FLAG);
72}
73
69/* 74/*
70 * Do NOT EVER look at the BIOS memory size location. 75 * Do NOT EVER look at the BIOS memory size location.
71 * It does not work on many machines. 76 * It does not work on many machines.
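kaslr_enabled() above only tests a loadflags bit that the boot path sets (KASLR_FLAG, added to bootparam.h later in this diff). A minimal stand-alone sketch of the same check; the struct layouts here are reduced stand-ins for illustration only:

#include <stdbool.h>
#include <stdio.h>

#define KASLR_FLAG	(1 << 1)	/* bit 1 of loadflags, per bootparam.h below */

struct setup_header	{ unsigned char loadflags; };	/* reduced stand-in */
struct boot_params	{ struct setup_header hdr;  };	/* reduced stand-in */

static bool kaslr_enabled(const struct boot_params *bp)
{
	return !!(bp->hdr.loadflags & KASLR_FLAG);
}

int main(void)
{
	struct boot_params bp = { .hdr = { .loadflags = KASLR_FLAG } };

	printf("kaslr %s\n", kaslr_enabled(&bp) ? "enabled" : "disabled");
	return 0;
}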
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
57 unsigned long ip; 57 unsigned long ip;
58 unsigned long flags; 58 unsigned long flags;
59 unsigned short cs; 59 unsigned short cs;
60 unsigned short gs; 60 unsigned short __pad2; /* Was called gs, but was always zero. */
61 unsigned short fs; 61 unsigned short __pad1; /* Was called fs, but was always zero. */
62 unsigned short __pad0; 62 unsigned short ss;
63 unsigned long err; 63 unsigned long err;
64 unsigned long trapno; 64 unsigned long trapno;
65 unsigned long oldmask; 65 unsigned long oldmask;
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 7a958164088c..89db46752a8f 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,9 +13,7 @@
13 X86_EFLAGS_CF | X86_EFLAGS_RF) 13 X86_EFLAGS_CF | X86_EFLAGS_RF)
14 14
15void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 15void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
16 16int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
17int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
18 unsigned long *pax);
19int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 17int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
20 struct pt_regs *regs, unsigned long mask); 18 struct pt_regs *regs, unsigned long mask);
21 19
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
27 27
28#ifdef CONFIG_X86_SMAP 28#ifdef CONFIG_X86_SMAP
29 29
30#define ASM_CLAC \ 30#define ASM_CLAC \
31 661: ASM_NOP3 ; \ 31 ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
32 .pushsection .altinstr_replacement, "ax" ; \ 32
33 662: __ASM_CLAC ; \ 33#define ASM_STAC \
34 .popsection ; \ 34 ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
35 .pushsection .altinstructions, "a" ; \
36 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
37 .popsection
38
39#define ASM_STAC \
40 661: ASM_NOP3 ; \
41 .pushsection .altinstr_replacement, "ax" ; \
42 662: __ASM_STAC ; \
43 .popsection ; \
44 .pushsection .altinstructions, "a" ; \
45 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
46 .popsection
47 35
48#else /* CONFIG_X86_SMAP */ 36#else /* CONFIG_X86_SMAP */
49 37
@@ -61,20 +49,20 @@
61static __always_inline void clac(void) 49static __always_inline void clac(void)
62{ 50{
63 /* Note: a barrier is implicit in alternative() */ 51 /* Note: a barrier is implicit in alternative() */
64 alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); 52 alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
65} 53}
66 54
67static __always_inline void stac(void) 55static __always_inline void stac(void)
68{ 56{
69 /* Note: a barrier is implicit in alternative() */ 57 /* Note: a barrier is implicit in alternative() */
70 alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); 58 alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
71} 59}
72 60
73/* These macros can be used in asm() statements */ 61/* These macros can be used in asm() statements */
74#define ASM_CLAC \ 62#define ASM_CLAC \
75 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) 63 ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
76#define ASM_STAC \ 64#define ASM_STAC \
77 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) 65 ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
78 66
79#else /* CONFIG_X86_SMAP */ 67#else /* CONFIG_X86_SMAP */
80 68
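The smap.h simplification leans on the reworked ALTERNATIVE()/alternative() accepting an empty original instruction and padding it with NOPs at patch time (see the alternative.c changes further down in this diff). A toy C model of the end result, with function pointers standing in for the in-place instruction patching; all names here are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of ALTERNATIVE("", "newinstr", feature): at patch time either a
 * no-op or the replacement is selected based on a CPU feature bit.  The real
 * kernel rewrites instruction bytes in place; function pointers stand in here.
 */
typedef void (*patched_fn)(void);

static void nop_fn(void)  { /* feature absent: the site stays a NOP */ }
static void clac_fn(void) { printf("clac\n"); }	/* stands in for the CLAC insn */
static void stac_fn(void) { printf("stac\n"); }	/* stands in for the STAC insn */

static patched_fn alt_clac = nop_fn;
static patched_fn alt_stac = nop_fn;

static void apply_alternatives_model(bool cpu_has_smap)
{
	alt_clac = cpu_has_smap ? clac_fn : nop_fn;
	alt_stac = cpu_has_smap ? stac_fn : nop_fn;
}

int main(void)
{
	apply_alternatives_model(true);

	alt_stac();	/* open a user-access window */
	/* ... user-space access would happen here ... */
	alt_clac();	/* close it again */
	return 0;
}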
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..17a8dced12da 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
150} 150}
151 151
152void cpu_disable_common(void); 152void cpu_disable_common(void);
153void cpu_die_common(unsigned int cpu);
154void native_smp_prepare_boot_cpu(void); 153void native_smp_prepare_boot_cpu(void);
155void native_smp_prepare_cpus(unsigned int max_cpus); 154void native_smp_prepare_cpus(unsigned int max_cpus);
156void native_smp_cpus_done(unsigned int max_cpus); 155void native_smp_cpus_done(unsigned int max_cpus);
156void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
158int native_cpu_disable(void); 158int native_cpu_disable(void);
159int common_cpu_die(unsigned int cpu);
159void native_cpu_die(unsigned int cpu); 160void native_cpu_die(unsigned int cpu);
160void native_play_dead(void); 161void native_play_dead(void);
161void play_dead_common(void); 162void play_dead_common(void);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00fafb00..aeb4666e0c0a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
4 4
5#ifdef __KERNEL__ 5#ifdef __KERNEL__
6 6
7#include <asm/nops.h>
8
7static inline void native_clts(void) 9static inline void native_clts(void)
8{ 10{
9 asm volatile("clts"); 11 asm volatile("clts");
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p)
199 "+m" (*(volatile char __force *)__p)); 201 "+m" (*(volatile char __force *)__p));
200} 202}
201 203
204static inline void clwb(volatile void *__p)
205{
206 volatile struct { char x[64]; } *p = __p;
207
208 asm volatile(ALTERNATIVE_2(
209 ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
210 ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
211 X86_FEATURE_CLFLUSHOPT,
212 ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
213 X86_FEATURE_CLWB)
214 : [p] "+m" (*p)
215 : [pax] "a" (p));
216}
217
218static inline void pcommit_sfence(void)
219{
220 alternative(ASM_NOP7,
221 ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
222 "sfence",
223 X86_FEATURE_PCOMMIT);
224}
225
202#define nop() asm volatile ("nop") 226#define nop() asm volatile ("nop")
203 227
204 228
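clwb() above patches in a single cache-line write-back per call (CLFLUSH, CLFLUSHOPT or CLWB, whichever the CPU supports), so a caller would typically walk a buffer in cache-line-sized steps. A hedged user-space sketch of such a loop; flush_line() is a placeholder for the patched instruction and 64 bytes is an assumed line size:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE_SIZE	64	/* assumed x86 cache-line size */

/* Placeholder for the patched clflush/clflushopt/clwb instruction above. */
static void flush_line(void *p)
{
	printf("write back line at %p\n", p);
}

/* Write back every cache line that covers [addr, addr + len). */
static void flush_range(void *addr, size_t len)
{
	uintptr_t p   = (uintptr_t)addr & ~(uintptr_t)(CACHELINE_SIZE - 1);
	uintptr_t end = (uintptr_t)addr + len;

	for (; p < end; p += CACHELINE_SIZE)
		flush_line((void *)p);
}

int main(void)
{
	static char buf[256];

	flush_range(buf + 10, 100);	/* touches two or three lines depending on alignment */
	return 0;
}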
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..b4bdec3e9523 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,19 +13,44 @@
13#include <asm/types.h> 13#include <asm/types.h>
14 14
15/* 15/*
16 * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
17 * reserve at the top of the kernel stack. We do it because of a nasty
18 * 32-bit corner case. On x86_32, the hardware stack frame is
19 * variable-length. Except for vm86 mode, struct pt_regs assumes a
20 * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
21 * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
22 * does in at least one case:
23 *
24 * If we take an NMI early enough in SYSENTER, then we can end up with
25 * pt_regs that extends above sp0. On the way out, in the espfix code,
26 * we can read the saved SS value, but that value will be above sp0.
27 * Without this offset, that can result in a page fault. (We are
28 * careful that, in this case, the value we read doesn't matter.)
29 *
30 * In vm86 mode, the hardware frame is much longer still, but we neither
31 * access the extra members from NMI context, nor do we write such a
32 * frame at sp0 at all.
33 *
34 * x86_64 has a fixed-length stack frame.
35 */
36#ifdef CONFIG_X86_32
37# define TOP_OF_KERNEL_STACK_PADDING 8
38#else
39# define TOP_OF_KERNEL_STACK_PADDING 0
40#endif
41
42/*
16 * low level task data that entry.S needs immediate access to 43 * low level task data that entry.S needs immediate access to
17 * - this struct should fit entirely inside of one cache line 44 * - this struct should fit entirely inside of one cache line
18 * - this struct shares the supervisor stack pages 45 * - this struct shares the supervisor stack pages
19 */ 46 */
20#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
21struct task_struct; 48struct task_struct;
22struct exec_domain;
23#include <asm/processor.h> 49#include <asm/processor.h>
24#include <linux/atomic.h> 50#include <linux/atomic.h>
25 51
26struct thread_info { 52struct thread_info {
27 struct task_struct *task; /* main task structure */ 53 struct task_struct *task; /* main task structure */
28 struct exec_domain *exec_domain; /* execution domain */
29 __u32 flags; /* low level flags */ 54 __u32 flags; /* low level flags */
30 __u32 status; /* thread synchronous flags */ 55 __u32 status; /* thread synchronous flags */
31 __u32 cpu; /* current CPU */ 56 __u32 cpu; /* current CPU */
@@ -39,7 +64,6 @@ struct thread_info {
39#define INIT_THREAD_INFO(tsk) \ 64#define INIT_THREAD_INFO(tsk) \
40{ \ 65{ \
41 .task = &tsk, \ 66 .task = &tsk, \
42 .exec_domain = &default_exec_domain, \
43 .flags = 0, \ 67 .flags = 0, \
44 .cpu = 0, \ 68 .cpu = 0, \
45 .saved_preempt_count = INIT_PREEMPT_COUNT, \ 69 .saved_preempt_count = INIT_PREEMPT_COUNT, \
@@ -145,7 +169,6 @@ struct thread_info {
145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) 169#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
146 170
147#define STACK_WARN (THREAD_SIZE/8) 171#define STACK_WARN (THREAD_SIZE/8)
148#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
149 172
150/* 173/*
151 * macros/functions for gaining access to the thread information structure 174 * macros/functions for gaining access to the thread information structure
@@ -158,10 +181,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
158 181
159static inline struct thread_info *current_thread_info(void) 182static inline struct thread_info *current_thread_info(void)
160{ 183{
161 struct thread_info *ti; 184 return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
162 ti = (void *)(this_cpu_read_stable(kernel_stack) +
163 KERNEL_STACK_OFFSET - THREAD_SIZE);
164 return ti;
165} 185}
166 186
167static inline unsigned long current_stack_pointer(void) 187static inline unsigned long current_stack_pointer(void)
@@ -177,16 +197,37 @@ static inline unsigned long current_stack_pointer(void)
177 197
178#else /* !__ASSEMBLY__ */ 198#else /* !__ASSEMBLY__ */
179 199
180/* how to get the thread information struct from ASM */ 200/* Load thread_info address into "reg" */
181#define GET_THREAD_INFO(reg) \ 201#define GET_THREAD_INFO(reg) \
182 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ 202 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
183 _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; 203 _ASM_SUB $(THREAD_SIZE),reg ;
184 204
185/* 205/*
186 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in 206 * ASM operand which evaluates to a 'thread_info' address of
187 * a certain register (to be used in assembler memory operands). 207 * the current task, if it is known that "reg" is exactly "off"
208 * bytes below the top of the stack currently.
209 *
210 * ( The kernel stack's size is known at build time, it is usually
211 * 2 or 4 pages, and the bottom of the kernel stack contains
212 * the thread_info structure. So to access the thread_info very
213 * quickly from assembly code we can calculate down from the
214 * top of the kernel stack to the bottom, using constant,
215 * build-time calculations only. )
216 *
217 * For example, to fetch the current thread_info->flags value into %eax
218 * on x86-64 defconfig kernels, in syscall entry code where RSP is
219 * currently at exactly SIZEOF_PTREGS bytes away from the top of the
220 * stack:
221 *
222 * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
223 *
224 * will translate to:
225 *
226 * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
227 *
228 * which is below the current RSP by almost 16K.
188 */ 229 */
189#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) 230#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
190 231
191#endif 232#endif
192 233
@@ -236,6 +277,16 @@ static inline bool is_ia32_task(void)
236#endif 277#endif
237 return false; 278 return false;
238} 279}
280
281/*
282 * Force syscall return via IRET by making it look as if there was
283 * some work pending. IRET is our most capable (but slowest) syscall
284 * return path, which is able to restore modified SS, CS and certain
285 * EFLAGS values that other (fast) syscall return instructions
286 * are not able to restore properly.
287 */
288#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
289
239#endif /* !__ASSEMBLY__ */ 290#endif /* !__ASSEMBLY__ */
240 291
241#ifndef __ASSEMBLY__ 292#ifndef __ASSEMBLY__
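With KERNEL_STACK_OFFSET removed, both current_thread_info() and GET_THREAD_INFO() above locate thread_info purely as top-of-stack minus THREAD_SIZE, relying on thread_info sitting at the bottom of the THREAD_SIZE-aligned stack area. A small pointer-arithmetic sketch of that invariant (the size and the struct are reduced stand-ins):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE	(4 * 4096)	/* assumed: four-page kernel stack */

struct thread_info { unsigned int flags; };	/* reduced stand-in */

/* thread_info sits at the very bottom of the stack area, so subtracting
 * THREAD_SIZE from the top-of-stack value lands exactly on it. */
static struct thread_info *thread_info_from_top(uintptr_t top_of_stack)
{
	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
}

int main(void)
{
	void *stack = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
	uintptr_t top = (uintptr_t)stack + THREAD_SIZE;
	struct thread_info *ti = thread_info_from_top(top);

	assert((void *)ti == stack);	/* bottom of the stack area */
	printf("stack=%p thread_info=%p top=%#lx\n",
	       stack, (void *)ti, (unsigned long)top);
	free(stack);
	return 0;
}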
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..f2f9b39b274a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
231} 231}
232 232
233unsigned long 233unsigned long
234copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); 234copy_user_handle_tail(char *to, char *from, unsigned len);
235 235
236#endif /* _ASM_X86_UACCESS_64_H */ 236#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..ab456dc233b5 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -15,6 +15,7 @@
15 15
16/* loadflags */ 16/* loadflags */
17#define LOADED_HIGH (1<<0) 17#define LOADED_HIGH (1<<0)
18#define KASLR_FLAG (1<<1)
18#define QUIET_FLAG (1<<5) 19#define QUIET_FLAG (1<<5)
19#define KEEP_SEGMENTS (1<<6) 20#define KEEP_SEGMENTS (1<<6)
20#define CAN_USE_HEAP (1<<7) 21#define CAN_USE_HEAP (1<<7)
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
25#else /* __i386__ */ 25#else /* __i386__ */
26 26
27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
28/*
29 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
30 * unless syscall needs a complete, fully filled "struct pt_regs".
31 */
28#define R15 0 32#define R15 0
29#define R14 8 33#define R14 8
30#define R13 16 34#define R13 16
31#define R12 24 35#define R12 24
32#define RBP 32 36#define RBP 32
33#define RBX 40 37#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save up to here*/ 38/* These regs are callee-clobbered. Always saved on kernel entry. */
35#define R11 48 39#define R11 48
36#define R10 56 40#define R10 56
37#define R9 64 41#define R9 64
@@ -41,15 +45,17 @@
41#define RDX 96 45#define RDX 96
42#define RSI 104 46#define RSI 104
43#define RDI 112 47#define RDI 112
44#define ORIG_RAX 120 /* = ERROR */ 48/*
45/* end of arguments */ 49 * On syscall entry, this is syscall#. On CPU exception, this is error code.
46/* cpu exception frame or undefined in case of fast syscall. */ 50 * On hw interrupt, it's IRQ number:
51 */
52#define ORIG_RAX 120
53/* Return frame for iretq */
47#define RIP 128 54#define RIP 128
48#define CS 136 55#define CS 136
49#define EFLAGS 144 56#define EFLAGS 144
50#define RSP 152 57#define RSP 152
51#define SS 160 58#define SS 160
52#define ARGOFFSET R11
53#endif /* __ASSEMBLY__ */ 59#endif /* __ASSEMBLY__ */
54 60
55/* top of stack page */ 61/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
41#ifndef __KERNEL__ 41#ifndef __KERNEL__
42 42
43struct pt_regs { 43struct pt_regs {
44/*
45 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
46 * unless syscall needs a complete, fully filled "struct pt_regs".
47 */
44 unsigned long r15; 48 unsigned long r15;
45 unsigned long r14; 49 unsigned long r14;
46 unsigned long r13; 50 unsigned long r13;
47 unsigned long r12; 51 unsigned long r12;
48 unsigned long rbp; 52 unsigned long rbp;
49 unsigned long rbx; 53 unsigned long rbx;
50/* arguments: non interrupts/non tracing syscalls only save up to here*/ 54/* These regs are callee-clobbered. Always saved on kernel entry. */
51 unsigned long r11; 55 unsigned long r11;
52 unsigned long r10; 56 unsigned long r10;
53 unsigned long r9; 57 unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
57 unsigned long rdx; 61 unsigned long rdx;
58 unsigned long rsi; 62 unsigned long rsi;
59 unsigned long rdi; 63 unsigned long rdi;
64/*
65 * On syscall entry, this is syscall#. On CPU exception, this is error code.
66 * On hw interrupt, it's IRQ number:
67 */
60 unsigned long orig_rax; 68 unsigned long orig_rax;
61/* end of arguments */ 69/* Return frame for iretq */
62/* cpu exception frame or undefined */
63 unsigned long rip; 70 unsigned long rip;
64 unsigned long cs; 71 unsigned long cs;
65 unsigned long eflags; 72 unsigned long eflags;
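The ptrace-abi.h offsets above are plain byte offsets into the 64-bit pt_regs layout shown in this hunk (8 bytes per field, r15 first). A quick offsetof() cross-check against a reduced copy of that layout:

#include <stddef.h>
#include <stdio.h>

/* Reduced copy of the 64-bit pt_regs layout from the hunk above. */
struct pt_regs_sketch {
	unsigned long r15, r14, r13, r12, rbp, rbx;		  /* callee-preserved   */
	unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi; /* callee-clobbered   */
	unsigned long orig_rax;					  /* syscall#/error/IRQ# */
	unsigned long rip, cs, eflags, rsp, ss;			  /* iretq return frame */
};

int main(void)
{
	/* Prints 112 120 128 160 -- matching RDI, ORIG_RAX, RIP and SS above. */
	printf("rdi=%zu orig_rax=%zu rip=%zu ss=%zu\n",
	       offsetof(struct pt_regs_sketch, rdi),
	       offsetof(struct pt_regs_sketch, orig_rax),
	       offsetof(struct pt_regs_sketch, rip),
	       offsetof(struct pt_regs_sketch, ss));
	return 0;
}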
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@ struct sigcontext {
177 __u64 rip; 177 __u64 rip;
178 __u64 eflags; /* RFLAGS */ 178 __u64 eflags; /* RFLAGS */
179 __u16 cs; 179 __u16 cs;
180 __u16 gs; 180
181 __u16 fs; 181 /*
182 __u16 __pad0; 182 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
183 * Linux saved and restored fs and gs in these slots. This
184 * was counterproductive, as fsbase and gsbase were never
185 * saved, so arch_prctl was presumably unreliable.
186 *
187 * If these slots are ever needed for any other purpose, there
188 * is some risk that very old 64-bit binaries could get
189 * confused. I doubt that many such binaries still work,
190 * though, since the same patch in 2.5.64 also removed the
191 * 64-bit set_thread_area syscall, so it appears that there is
192 * no TLS API that works in both pre- and post-2.5.64 kernels.
193 */
194 __u16 __pad2; /* Was gs. */
195 __u16 __pad1; /* Was fs. */
196
197 __u16 ss;
183 __u64 err; 198 __u64 err;
184 __u64 trapno; 199 __u64 trapno;
185 __u64 oldmask; 200 __u64 oldmask;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index c5f1a1deb91a..1fe92181ee9e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
67#define EXIT_REASON_EPT_VIOLATION 48 67#define EXIT_REASON_EPT_VIOLATION 48
68#define EXIT_REASON_EPT_MISCONFIG 49 68#define EXIT_REASON_EPT_MISCONFIG 49
69#define EXIT_REASON_INVEPT 50 69#define EXIT_REASON_INVEPT 50
70#define EXIT_REASON_RDTSCP 51
70#define EXIT_REASON_PREEMPTION_TIMER 52 71#define EXIT_REASON_PREEMPTION_TIMER 52
71#define EXIT_REASON_INVVPID 53 72#define EXIT_REASON_INVVPID 53
72#define EXIT_REASON_WBINVD 54 73#define EXIT_REASON_WBINVD 54
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
33obj-$(CONFIG_X86_64) += mcount_64.o 33obj-$(CONFIG_X86_64) += mcount_64.o
34obj-y += syscall_$(BITS).o vsyscall_gtod.o 34obj-y += syscall_$(BITS).o vsyscall_gtod.o
35obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
35obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o 36obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
36obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o 37obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
37obj-$(CONFIG_SYSFS) += ksysfs.o 38obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 52__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 53#endif
54 54
55#define DPRINTK(fmt, ...) \ 55#define DPRINTK(fmt, args...) \
56do { \ 56do { \
57 if (debug_alternative) \ 57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ 58 printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
59} while (0)
60
61#define DUMP_BYTES(buf, len, fmt, args...) \
62do { \
63 if (unlikely(debug_alternative)) { \
64 int j; \
65 \
66 if (!(len)) \
67 break; \
68 \
69 printk(KERN_DEBUG fmt, ##args); \
70 for (j = 0; j < (len) - 1; j++) \
71 printk(KERN_CONT "%02hhx ", buf[j]); \
72 printk(KERN_CONT "%02hhx\n", buf[j]); \
73 } \
59} while (0) 74} while (0)
60 75
61/* 76/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
243extern s32 __smp_locks[], __smp_locks_end[]; 258extern s32 __smp_locks[], __smp_locks_end[];
244void *text_poke_early(void *addr, const void *opcode, size_t len); 259void *text_poke_early(void *addr, const void *opcode, size_t len);
245 260
246/* Replace instructions with better alternatives for this CPU type. 261/*
247 This runs before SMP is initialized to avoid SMP problems with 262 * Are we looking at a near JMP with a 1 or 4-byte displacement.
248 self modifying code. This implies that asymmetric systems where 263 */
249 APs have less capabilities than the boot processor are not handled. 264static inline bool is_jmp(const u8 opcode)
250 Tough. Make sure you disable such features by hand. */ 265{
266 return opcode == 0xeb || opcode == 0xe9;
267}
268
269static void __init_or_module
270recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
271{
272 u8 *next_rip, *tgt_rip;
273 s32 n_dspl, o_dspl;
274 int repl_len;
275
276 if (a->replacementlen != 5)
277 return;
278
279 o_dspl = *(s32 *)(insnbuf + 1);
280
281 /* next_rip of the replacement JMP */
282 next_rip = repl_insn + a->replacementlen;
283 /* target rip of the replacement JMP */
284 tgt_rip = next_rip + o_dspl;
285 n_dspl = tgt_rip - orig_insn;
286
287 DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
288
289 if (tgt_rip - orig_insn >= 0) {
290 if (n_dspl - 2 <= 127)
291 goto two_byte_jmp;
292 else
293 goto five_byte_jmp;
294 /* negative offset */
295 } else {
296 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
297 goto two_byte_jmp;
298 else
299 goto five_byte_jmp;
300 }
301
302two_byte_jmp:
303 n_dspl -= 2;
304
305 insnbuf[0] = 0xeb;
306 insnbuf[1] = (s8)n_dspl;
307 add_nops(insnbuf + 2, 3);
308
309 repl_len = 2;
310 goto done;
311
312five_byte_jmp:
313 n_dspl -= 5;
314
315 insnbuf[0] = 0xe9;
316 *(s32 *)&insnbuf[1] = n_dspl;
251 317
318 repl_len = 5;
319
320done:
321
322 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
323 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
324}
325
326static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
327{
328 if (instr[0] != 0x90)
329 return;
330
331 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
332
333 DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
334 instr, a->instrlen - a->padlen, a->padlen);
335}
336
337/*
338 * Replace instructions with better alternatives for this CPU type. This runs
339 * before SMP is initialized to avoid SMP problems with self modifying code.
340 * This implies that asymmetric systems where APs have less capabilities than
341 * the boot processor are not handled. Tough. Make sure you disable such
342 * features by hand.
343 */
252void __init_or_module apply_alternatives(struct alt_instr *start, 344void __init_or_module apply_alternatives(struct alt_instr *start,
253 struct alt_instr *end) 345 struct alt_instr *end)
254{ 346{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
256 u8 *instr, *replacement; 348 u8 *instr, *replacement;
257 u8 insnbuf[MAX_PATCH_LEN]; 349 u8 insnbuf[MAX_PATCH_LEN];
258 350
259 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 351 DPRINTK("alt table %p -> %p", start, end);
260 /* 352 /*
261 * The scan order should be from start to end. A later scanned 353 * The scan order should be from start to end. A later scanned
262 * alternative code can overwrite a previous scanned alternative code. 354 * alternative code can overwrite previously scanned alternative code.
263 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 355 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
264 * patch code. 356 * patch code.
265 * 357 *
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
267 * order. 359 * order.
268 */ 360 */
269 for (a = start; a < end; a++) { 361 for (a = start; a < end; a++) {
362 int insnbuf_sz = 0;
363
270 instr = (u8 *)&a->instr_offset + a->instr_offset; 364 instr = (u8 *)&a->instr_offset + a->instr_offset;
271 replacement = (u8 *)&a->repl_offset + a->repl_offset; 365 replacement = (u8 *)&a->repl_offset + a->repl_offset;
272 BUG_ON(a->replacementlen > a->instrlen);
273 BUG_ON(a->instrlen > sizeof(insnbuf)); 366 BUG_ON(a->instrlen > sizeof(insnbuf));
274 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 367 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
275 if (!boot_cpu_has(a->cpuid)) 368 if (!boot_cpu_has(a->cpuid)) {
369 if (a->padlen > 1)
370 optimize_nops(a, instr);
371
276 continue; 372 continue;
373 }
374
375 DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
376 a->cpuid >> 5,
377 a->cpuid & 0x1f,
378 instr, a->instrlen,
379 replacement, a->replacementlen, a->padlen);
380
381 DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
382 DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
277 383
278 memcpy(insnbuf, replacement, a->replacementlen); 384 memcpy(insnbuf, replacement, a->replacementlen);
385 insnbuf_sz = a->replacementlen;
279 386
280 /* 0xe8 is a relative jump; fix the offset. */ 387 /* 0xe8 is a relative jump; fix the offset. */
281 if (*insnbuf == 0xe8 && a->replacementlen == 5) 388 if (*insnbuf == 0xe8 && a->replacementlen == 5) {
282 *(s32 *)(insnbuf + 1) += replacement - instr; 389 *(s32 *)(insnbuf + 1) += replacement - instr;
390 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
391 *(s32 *)(insnbuf + 1),
392 (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
393 }
394
395 if (a->replacementlen && is_jmp(replacement[0]))
396 recompute_jump(a, instr, replacement, insnbuf);
283 397
284 add_nops(insnbuf + a->replacementlen, 398 if (a->instrlen > a->replacementlen) {
285 a->instrlen - a->replacementlen); 399 add_nops(insnbuf + a->replacementlen,
400 a->instrlen - a->replacementlen);
401 insnbuf_sz += a->instrlen - a->replacementlen;
402 }
403 DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
286 404
287 text_poke_early(instr, insnbuf, a->instrlen); 405 text_poke_early(instr, insnbuf, insnbuf_sz);
288 } 406 }
289} 407}
290 408
291#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
292
293static void alternatives_smp_lock(const s32 *start, const s32 *end, 410static void alternatives_smp_lock(const s32 *start, const s32 *end,
294 u8 *text, u8 *text_end) 411 u8 *text, u8 *text_end)
295{ 412{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
371 smp->locks_end = locks_end; 488 smp->locks_end = locks_end;
372 smp->text = text; 489 smp->text = text;
373 smp->text_end = text_end; 490 smp->text_end = text_end;
374 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", 491 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
375 __func__, smp->locks, smp->locks_end, 492 smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 493 smp->text, smp->text_end, smp->name);
377 494
378 list_add_tail(&smp->next, &smp_alt_modules); 495 list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
440 557
441 return 0; 558 return 0;
442} 559}
443#endif 560#endif /* CONFIG_SMP */
444 561
445#ifdef CONFIG_PARAVIRT 562#ifdef CONFIG_PARAVIRT
446void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 563void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
601 if (likely(!bp_patching_in_progress)) 718 if (likely(!bp_patching_in_progress))
602 return 0; 719 return 0;
603 720
604 if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) 721 if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
605 return 0; 722 return 0;
606 723
607 /* set up the specified breakpoint handler */ 724 /* set up the specified breakpoint handler */
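recompute_jump() above re-targets a relocated 5-byte JMP: the displacement stored in the replacement bytes is only valid relative to the replacement location, so it is recomputed against the original instruction address and, if it then fits in a signed byte, shrunk to a 2-byte JMP rel8. A stand-alone sketch of that displacement math with arbitrary example addresses:

#include <stdint.h>
#include <stdio.h>

/*
 * A near JMP encodes target = address-after-the-jmp + displacement.  When the
 * 5-byte "jmp rel32" is copied from the replacement area to the patch site,
 * the stored displacement must be recomputed against the new location -- and
 * may then fit the short 2-byte "jmp rel8" form.
 */
int main(void)
{
	uintptr_t orig_insn = 0x1000;	/* example patch site           */
	uintptr_t repl_insn = 0x9000;	/* example replacement location */
	int32_t   o_dspl    = 0x40;	/* rel32 stored in the copy     */

	uintptr_t next_rip = repl_insn + 5;	/* end of the replacement JMP */
	uintptr_t tgt_rip  = next_rip + o_dspl;	/* where the JMP really goes  */
	int32_t   n_dspl   = (int32_t)(tgt_rip - orig_insn);

	if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127)
		printf("2-byte jmp, rel8  = %d\n", (int)n_dspl - 2);
	else
		printf("5-byte jmp, rel32 = %d\n", (int)n_dspl - 5);
	return 0;
}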
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639ae1b9b..dcb52850a28f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
1084 local_irq_restore(flags); 1084 local_irq_restore(flags);
1085} 1085}
1086 1086
1087/*
1088 * This is to verify that we're looking at a real local APIC.
1089 * Check these against your board if the CPUs aren't getting
1090 * started for no apparent reason.
1091 */
1092int __init verify_local_APIC(void)
1093{
1094 unsigned int reg0, reg1;
1095
1096 /*
1097 * The version register is read-only in a real APIC.
1098 */
1099 reg0 = apic_read(APIC_LVR);
1100 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
1101 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
1102 reg1 = apic_read(APIC_LVR);
1103 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
1104
1105 /*
1106 * The two version reads above should print the same
1107 * numbers. If the second one is different, then we
1108 * poke at a non-APIC.
1109 */
1110 if (reg1 != reg0)
1111 return 0;
1112
1113 /*
1114 * Check if the version looks reasonably.
1115 */
1116 reg1 = GET_APIC_VERSION(reg0);
1117 if (reg1 == 0x00 || reg1 == 0xff)
1118 return 0;
1119 reg1 = lapic_get_maxlvt();
1120 if (reg1 < 0x02 || reg1 == 0xff)
1121 return 0;
1122
1123 /*
1124 * The ID register is read/write in a real APIC.
1125 */
1126 reg0 = apic_read(APIC_ID);
1127 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1128 apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
1129 reg1 = apic_read(APIC_ID);
1130 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1131 apic_write(APIC_ID, reg0);
1132 if (reg1 != (reg0 ^ apic->apic_id_mask))
1133 return 0;
1134
1135 /*
1136 * The next two are just to see if we have sane values.
1137 * They're only really relevant if we're in Virtual Wire
1138 * compatibility mode, but most boxes are anymore.
1139 */
1140 reg0 = apic_read(APIC_LVT0);
1141 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
1142 reg1 = apic_read(APIC_LVT1);
1143 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
1144
1145 return 1;
1146}
1147
1148/** 1087/**
1149 * sync_Arb_IDs - synchronize APIC bus arbitration IDs 1088 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
1150 */ 1089 */
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
2283 disable_ioapic_support(); 2222 disable_ioapic_support();
2284 2223
2285 default_setup_apic_routing(); 2224 default_setup_apic_routing();
2286 verify_local_APIC();
2287 apic_bsp_setup(true); 2225 apic_bsp_setup(true);
2288 return 0; 2226 return 0;
2289} 2227}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21681c8..d9d0bd2faaf4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
135 135
136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
137 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); 138 cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) { 139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) 140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue; 141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); 142 cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); 143 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 } 144 }
145} 145}
146 146
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
195 195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); 196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197 197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); 198 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier); 199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1; 200 return 1;
201} 201}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8e9dcfd630e4..c8d92950bc04 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void)
144 144
145static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 145static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
146{ 146{
147 int pnodeid, is_uv1, is_uv2, is_uv3; 147 int pnodeid;
148 148 int uv_apic;
149 is_uv1 = !strcmp(oem_id, "SGI"); 149
150 is_uv2 = !strcmp(oem_id, "SGI2"); 150 if (strncmp(oem_id, "SGI", 3) != 0)
151 is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ 151 return 0;
152 if (is_uv1 || is_uv2 || is_uv3) { 152
153 uv_hub_info->hub_revision = 153 /*
154 (is_uv1 ? UV1_HUB_REVISION_BASE : 154 * Determine UV arch type.
155 (is_uv2 ? UV2_HUB_REVISION_BASE : 155 * SGI: UV100/1000
156 UV3_HUB_REVISION_BASE)); 156 * SGI2: UV2000/3000
157 pnodeid = early_get_pnodeid(); 157 * SGI3: UV300 (truncated to 4 chars because of different varieties)
158 early_get_apic_pnode_shift(); 158 */
159 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 159 uv_hub_info->hub_revision =
160 x86_platform.nmi_init = uv_nmi_init; 160 !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
161 if (!strcmp(oem_table_id, "UVL")) 161 !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
162 uv_system_type = UV_LEGACY_APIC; 162 !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
163 else if (!strcmp(oem_table_id, "UVX")) 163
164 uv_system_type = UV_X2APIC; 164 if (uv_hub_info->hub_revision == 0)
165 else if (!strcmp(oem_table_id, "UVH")) { 165 goto badbios;
166 __this_cpu_write(x2apic_extra_bits, 166
167 pnodeid << uvh_apicid.s.pnode_shift); 167 pnodeid = early_get_pnodeid();
168 uv_system_type = UV_NON_UNIQUE_APIC; 168 early_get_apic_pnode_shift();
169 uv_set_apicid_hibit(); 169 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
170 return 1; 170 x86_platform.nmi_init = uv_nmi_init;
171 } 171
172 if (!strcmp(oem_table_id, "UVX")) { /* most common */
173 uv_system_type = UV_X2APIC;
174 uv_apic = 0;
175
176 } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */
177 uv_system_type = UV_NON_UNIQUE_APIC;
178 __this_cpu_write(x2apic_extra_bits,
179 pnodeid << uvh_apicid.s.pnode_shift);
180 uv_set_apicid_hibit();
181 uv_apic = 1;
182
183 } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */
184 uv_system_type = UV_LEGACY_APIC; /* very small systems */
185 uv_apic = 0;
186
187 } else {
188 goto badbios;
172 } 189 }
173 return 0; 190
191 pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
192 oem_id, oem_table_id, uv_system_type,
193 uv_min_hub_revision_id, uv_apic);
194
195 return uv_apic;
196
197badbios:
198 pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
199 pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
200 BUG();
174} 201}
175 202
176enum uv_system_type get_uv_system_type(void) 203enum uv_system_type get_uv_system_type(void)
@@ -854,10 +881,14 @@ void __init uv_system_init(void)
854 unsigned long mmr_base, present, paddr; 881 unsigned long mmr_base, present, paddr;
855 unsigned short pnode_mask; 882 unsigned short pnode_mask;
856 unsigned char n_lshift; 883 unsigned char n_lshift;
857 char *hub = (is_uv1_hub() ? "UV1" : 884 char *hub = (is_uv1_hub() ? "UV100/1000" :
858 (is_uv2_hub() ? "UV2" : 885 (is_uv2_hub() ? "UV2000/3000" :
859 "UV3")); 886 (is_uv3_hub() ? "UV300" : NULL)));
860 887
888 if (!hub) {
889 pr_err("UV: Unknown/unsupported UV hub\n");
890 return;
891 }
861 pr_info("UV: Found %s hub\n", hub); 892 pr_info("UV: Found %s hub\n", hub);
862 map_low_mmrs(); 893 map_low_mmrs();
863 894
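uv_acpi_madt_oem_check() above now folds the OEM ID test into one expression mapping the "SGI" prefix variants to hub revision bases. A tiny stand-alone version of that mapping; the revision base values here are placeholders, the real ones come from the UV headers:

#include <stdio.h>
#include <string.h>

/* Placeholder revision bases; the real values come from the UV headers. */
#define UV1_HUB_REVISION_BASE	1
#define UV2_HUB_REVISION_BASE	3
#define UV3_HUB_REVISION_BASE	5

static int oem_id_to_hub_revision(const char *oem_id)
{
	if (!strncmp(oem_id, "SGI3", 4))	/* UV300, several varieties */
		return UV3_HUB_REVISION_BASE;
	if (!strcmp(oem_id, "SGI2"))		/* UV2000/3000 */
		return UV2_HUB_REVISION_BASE;
	if (!strcmp(oem_id, "SGI"))		/* UV100/1000 */
		return UV1_HUB_REVISION_BASE;
	return 0;				/* not a UV system */
}

int main(void)
{
	printf("SGI3->%d SGI2->%d SGI->%d ACME->%d\n",
	       oem_id_to_hub_revision("SGI3"),
	       oem_id_to_hub_revision("SGI2"),
	       oem_id_to_hub_revision("SGI"),
	       oem_id_to_hub_revision("ACME"));
	return 0;
}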
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
68 68
69 /* Offset from the sysenter stack to tss.sp0 */ 69 /* Offset from the sysenter stack to tss.sp0 */
70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
71 sizeof(struct tss_struct)); 71 offsetofend(struct tss_struct, SYSENTER_stack));
72 72
73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
74 BLANK(); 74 BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
81#undef ENTRY 81#undef ENTRY
82 82
83 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 83 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
84 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
84 BLANK(); 85 BLANK();
85 86
86 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 87 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..fd470ebf924e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/random.h>
8#include <asm/processor.h> 9#include <asm/processor.h>
9#include <asm/apic.h> 10#include <asm/apic.h>
10#include <asm/cpu.h> 11#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
488 489
489 va_align.mask = (upperbit - 1) & PAGE_MASK; 490 va_align.mask = (upperbit - 1) & PAGE_MASK;
490 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; 491 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
492
493 /* A random value per boot for bit slice [12:upper_bit) */
494 va_align.bits = get_random_int() & va_align.mask;
491 } 495 }
492} 496}
493 497
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 715 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
712 716
713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 717 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
718
719 /* 3DNow or LM implies PREFETCHW */
720 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
721 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
722 set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
714} 723}
715 724
716#ifdef CONFIG_X86_32 725#ifdef CONFIG_X86_32
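The bsp_init_amd() addition above draws one random value per boot and keeps only the page-aligned bit slice below the computed upper bit, to be mixed into mmap randomization. A sketch of that masking with an assumed upper bit and a fixed stand-in for get_random_int():

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

int main(void)
{
	unsigned int  upper_bit = 24;			/* assumed example: randomize VA bits [12, 24) */
	unsigned long upperbit  = 1UL << upper_bit;
	unsigned long mask      = (upperbit - 1) & PAGE_MASK;

	unsigned long fake_random = 0xdeadbeefUL;	/* stands in for get_random_int() */
	unsigned long bits        = fake_random & mask;	/* per-boot slice, page-aligned */

	printf("mask=%#lx bits=%#lx\n", mask, bits);
	return 0;
}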
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1cd4a1a44b95..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -998,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #endif
 }
 
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
-	/* Load these always in case some future AMD CPU supports
-	   SYSENTER from compat mode too. */
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
 #ifdef CONFIG_X86_32
 void enable_sep_cpu(void)
 {
-	int cpu = get_cpu();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss;
+	int cpu;
 
-	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		put_cpu();
-		return;
-	}
+	cpu = get_cpu();
+	tss = &per_cpu(cpu_tss, cpu);
+
+	if (!boot_cpu_has(X86_FEATURE_SEP))
+		goto out;
+
+	/*
+	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+	 * see the big comment in struct x86_hw_tss's definition.
+	 */
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
-	tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
-	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
-	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+	wrmsr(MSR_IA32_SYSENTER_ESP,
+	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+	      0);
+
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
 	put_cpu();
 }
 #endif
@@ -1157,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 #ifdef CONFIG_X86_64
@@ -1169,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
 /*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
  */
 DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
 	&init_task;
@@ -1210,10 +1209,23 @@ void syscall_init(void)
 	 */
 	wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
 	wrmsrl(MSR_LSTAR, system_call);
-	wrmsrl(MSR_CSTAR, ignore_sysret);
 
 #ifdef CONFIG_IA32_EMULATION
-	syscall32_cpu_init();
+	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	/*
+	 * This only works on Intel CPUs.
+	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+	 * This does not cause SYSENTER to jump to the wrong location, because
+	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+	 */
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
 
 	/* Flags to clear on syscall */
@@ -1265,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+	(unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
@@ -1346,7 +1367,7 @@ void cpu_init(void)
 	 */
 	load_ucode_ap();
 
-	t = &per_cpu(init_tss, cpu);
+	t = &per_cpu(cpu_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1430,7 +1451,7 @@ void cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
 	wait_for_master_cpu(cpu);
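
[Editor's note] The new MSR_IA32_SYSENTER_ESP value in enable_sep_cpu() is computed with offsetofend(), so the MSR points just past the TSS's SYSENTER_stack member instead of at a hand-maintained sp1. For readers unfamiliar with the helper, this is a minimal sketch of the usual definition; the kernel's own macro lives in its core headers and may differ cosmetically, and demo_tss below is an invented structure, not the real tss_struct:

#include <stddef.h>

/* Illustrative only: offset of the first byte *after* MEMBER inside TYPE. */
#define offsetofend_sketch(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct demo_tss {
	unsigned long other_state[32];
	unsigned long SYSENTER_stack[64];
};

/* The MSR would then point right past SYSENTER_stack: */
static unsigned long demo_sysenter_esp(struct demo_tss *t)
{
	return (unsigned long)t +
	       offsetofend_sketch(struct demo_tss, SYSENTER_stack);
}
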
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/device.h> 11#include <linux/cacheinfo.h>
13#include <linux/compiler.h>
14#include <linux/cpu.h> 12#include <linux/cpu.h>
15#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/sysfs.h>
16#include <linux/pci.h> 15#include <linux/pci.h>
17 16
18#include <asm/processor.h> 17#include <asm/processor.h>
19#include <linux/smp.h>
20#include <asm/amd_nb.h> 18#include <asm/amd_nb.h>
21#include <asm/smp.h> 19#include <asm/smp.h>
22 20
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
116 114
117 115
118enum _cache_type { 116enum _cache_type {
119 CACHE_TYPE_NULL = 0, 117 CTYPE_NULL = 0,
120 CACHE_TYPE_DATA = 1, 118 CTYPE_DATA = 1,
121 CACHE_TYPE_INST = 2, 119 CTYPE_INST = 2,
122 CACHE_TYPE_UNIFIED = 3 120 CTYPE_UNIFIED = 3
123}; 121};
124 122
125union _cpuid4_leaf_eax { 123union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
159 struct amd_northbridge *nb; 157 struct amd_northbridge *nb;
160}; 158};
161 159
162struct _cpuid4_info {
163 struct _cpuid4_info_regs base;
164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
165};
166
167unsigned short num_cache_leaves; 160unsigned short num_cache_leaves;
168 161
169/* AMD doesn't have CPUID4. Emulate it here to report the same 162/* AMD doesn't have CPUID4. Emulate it here to report the same
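
[Editor's note] The removed _cpuid4_info wrapper (a leaf plus its own shared_cpu_map bitmap) is superseded by the generic struct cacheinfo from <linux/cacheinfo.h>, which the rest of this patch populates. As rough orientation only -- the field list below is abridged and paraphrased from memory, see the header for the authoritative definition -- the generic leaf carries approximately:

/* Abridged sketch of the generic per-leaf record this file now fills in. */
struct cacheinfo_sketch {
	unsigned int	level;				/* 1, 2, 3 ... */
	enum cache_type	type;				/* NOCACHE/DATA/INST/UNIFIED */
	unsigned int	coherency_line_size;
	unsigned int	ways_of_associativity;
	unsigned int	physical_line_partition;
	unsigned int	number_of_sets;
	unsigned int	size;				/* in bytes */
	cpumask_t	shared_cpu_map;			/* CPUs sharing this cache */
	void		*priv;				/* here: struct amd_northbridge * */
};
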
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
220static const unsigned char levels[] = { 1, 1, 2, 3 }; 213static const unsigned char levels[] = { 1, 1, 2, 3 };
221static const unsigned char types[] = { 1, 2, 3, 3 }; 214static const unsigned char types[] = { 1, 2, 3, 3 };
222 215
216static const enum cache_type cache_type_map[] = {
217 [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
218 [CTYPE_DATA] = CACHE_TYPE_DATA,
219 [CTYPE_INST] = CACHE_TYPE_INST,
220 [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
221};
222
223static void 223static void
224amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 224amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
225 union _cpuid4_leaf_ebx *ebx, 225 union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
291 (ebx->split.ways_of_associativity + 1) - 1; 291 (ebx->split.ways_of_associativity + 1) - 1;
292} 292}
293 293
294struct _cache_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
297 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
298 unsigned int);
299};
300
301#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) 294#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
295
302/* 296/*
303 * L3 cache descriptors 297 * L3 cache descriptors
304 */ 298 */
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
325 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 319 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
326} 320}
327 321
328static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
329{
330 int node;
331
332 /* only for L3, and not in virtualized environments */
333 if (index < 3)
334 return;
335
336 node = amd_get_nb_id(smp_processor_id());
337 this_leaf->nb = node_to_amd_nb(node);
338 if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
339 amd_calc_l3_indices(this_leaf->nb);
340}
341
342/* 322/*
343 * check whether a slot used for disabling an L3 index is occupied. 323 * check whether a slot used for disabling an L3 index is occupied.
344 * @l3: L3 cache descriptor 324 * @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
359 return -1; 339 return -1;
360} 340}
361 341
362static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 342static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
363 unsigned int slot) 343 unsigned int slot)
364{ 344{
365 int index; 345 int index;
346 struct amd_northbridge *nb = this_leaf->priv;
366 347
367 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 348 index = amd_get_l3_disable_slot(nb, slot);
368 return -EINVAL;
369
370 index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
371 if (index >= 0) 349 if (index >= 0)
372 return sprintf(buf, "%d\n", index); 350 return sprintf(buf, "%d\n", index);
373 351
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
376 354
377#define SHOW_CACHE_DISABLE(slot) \ 355#define SHOW_CACHE_DISABLE(slot) \
378static ssize_t \ 356static ssize_t \
379show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ 357cache_disable_##slot##_show(struct device *dev, \
380 unsigned int cpu) \ 358 struct device_attribute *attr, char *buf) \
381{ \ 359{ \
360 struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
382 return show_cache_disable(this_leaf, buf, slot); \ 361 return show_cache_disable(this_leaf, buf, slot); \
383} 362}
384SHOW_CACHE_DISABLE(0) 363SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
446 return 0; 425 return 0;
447} 426}
448 427
449static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 428static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
450 const char *buf, size_t count, 429 const char *buf, size_t count,
451 unsigned int slot) 430 unsigned int slot)
452{ 431{
453 unsigned long val = 0; 432 unsigned long val = 0;
454 int cpu, err = 0; 433 int cpu, err = 0;
434 struct amd_northbridge *nb = this_leaf->priv;
455 435
456 if (!capable(CAP_SYS_ADMIN)) 436 if (!capable(CAP_SYS_ADMIN))
457 return -EPERM; 437 return -EPERM;
458 438
459 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 439 cpu = cpumask_first(&this_leaf->shared_cpu_map);
460 return -EINVAL;
461
462 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
463 440
464 if (kstrtoul(buf, 10, &val) < 0) 441 if (kstrtoul(buf, 10, &val) < 0)
465 return -EINVAL; 442 return -EINVAL;
466 443
467 err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); 444 err = amd_set_l3_disable_slot(nb, cpu, slot, val);
468 if (err) { 445 if (err) {
469 if (err == -EEXIST) 446 if (err == -EEXIST)
470 pr_warning("L3 slot %d in use/index already disabled!\n", 447 pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
476 453
477#define STORE_CACHE_DISABLE(slot) \ 454#define STORE_CACHE_DISABLE(slot) \
478static ssize_t \ 455static ssize_t \
479store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 456cache_disable_##slot##_store(struct device *dev, \
480 const char *buf, size_t count, \ 457 struct device_attribute *attr, \
481 unsigned int cpu) \ 458 const char *buf, size_t count) \
482{ \ 459{ \
460 struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
483 return store_cache_disable(this_leaf, buf, count, slot); \ 461 return store_cache_disable(this_leaf, buf, count, slot); \
484} 462}
485STORE_CACHE_DISABLE(0) 463STORE_CACHE_DISABLE(0)
486STORE_CACHE_DISABLE(1) 464STORE_CACHE_DISABLE(1)
487 465
488static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 466static ssize_t subcaches_show(struct device *dev,
489 show_cache_disable_0, store_cache_disable_0); 467 struct device_attribute *attr, char *buf)
490static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
491 show_cache_disable_1, store_cache_disable_1);
492
493static ssize_t
494show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
495{ 468{
496 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 469 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
497 return -EINVAL; 470 int cpu = cpumask_first(&this_leaf->shared_cpu_map);
498 471
499 return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); 472 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
500} 473}
501 474
502static ssize_t 475static ssize_t subcaches_store(struct device *dev,
503store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, 476 struct device_attribute *attr,
504 unsigned int cpu) 477 const char *buf, size_t count)
505{ 478{
479 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
480 int cpu = cpumask_first(&this_leaf->shared_cpu_map);
506 unsigned long val; 481 unsigned long val;
507 482
508 if (!capable(CAP_SYS_ADMIN)) 483 if (!capable(CAP_SYS_ADMIN))
509 return -EPERM; 484 return -EPERM;
510 485
511 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
512 return -EINVAL;
513
514 if (kstrtoul(buf, 16, &val) < 0) 486 if (kstrtoul(buf, 16, &val) < 0)
515 return -EINVAL; 487 return -EINVAL;
516 488
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
520 return count; 492 return count;
521} 493}
522 494
523static struct _cache_attr subcaches = 495static DEVICE_ATTR_RW(cache_disable_0);
524 __ATTR(subcaches, 0644, show_subcaches, store_subcaches); 496static DEVICE_ATTR_RW(cache_disable_1);
497static DEVICE_ATTR_RW(subcaches);
498
499static umode_t
500cache_private_attrs_is_visible(struct kobject *kobj,
501 struct attribute *attr, int unused)
502{
503 struct device *dev = kobj_to_dev(kobj);
504 struct cacheinfo *this_leaf = dev_get_drvdata(dev);
505 umode_t mode = attr->mode;
506
507 if (!this_leaf->priv)
508 return 0;
509
510 if ((attr == &dev_attr_subcaches.attr) &&
511 amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
512 return mode;
513
514 if ((attr == &dev_attr_cache_disable_0.attr ||
515 attr == &dev_attr_cache_disable_1.attr) &&
516 amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
517 return mode;
518
519 return 0;
520}
521
522static struct attribute_group cache_private_group = {
523 .is_visible = cache_private_attrs_is_visible,
524};
525
526static void init_amd_l3_attrs(void)
527{
528 int n = 1;
529 static struct attribute **amd_l3_attrs;
530
531 if (amd_l3_attrs) /* already initialized */
532 return;
533
534 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
535 n += 2;
536 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
537 n += 1;
538
539 amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
540 if (!amd_l3_attrs)
541 return;
542
543 n = 0;
544 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
545 amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
546 amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
547 }
548 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
549 amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
525 550
551 cache_private_group.attrs = amd_l3_attrs;
552}
553
554const struct attribute_group *
555cache_get_priv_group(struct cacheinfo *this_leaf)
556{
557 struct amd_northbridge *nb = this_leaf->priv;
558
559 if (this_leaf->level < 3 || !nb)
560 return NULL;
561
562 if (nb && nb->l3_cache.indices)
563 init_amd_l3_attrs();
564
565 return &cache_private_group;
566}
567
568static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
569{
570 int node;
571
572 /* only for L3, and not in virtualized environments */
573 if (index < 3)
574 return;
575
576 node = amd_get_nb_id(smp_processor_id());
577 this_leaf->nb = node_to_amd_nb(node);
578 if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
579 amd_calc_l3_indices(this_leaf->nb);
580}
526#else 581#else
527#define amd_init_l3_cache(x, y) 582#define amd_init_l3_cache(x, y)
528#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ 583#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
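
[Editor's note] The L3-specific sysfs files are now ordinary DEVICE_ATTR_RW() attributes whose presence is decided at registration time by an is_visible() callback, rather than being spliced into a kobj_type's default_attrs array. A condensed sketch of that general pattern follows; all names are hypothetical (demo_feature_present() in particular is a stand-in for a capability check such as amd_nb_has_feature()):

static ssize_t demo_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%d\n", 42);
}
static DEVICE_ATTR_RO(demo);

static struct attribute *demo_attrs[] = {
	&dev_attr_demo.attr,
	NULL,
};

/* Return 0 to hide the file, or the attribute's mode to expose it. */
static umode_t demo_is_visible(struct kobject *kobj, struct attribute *attr,
			       int index)
{
	return demo_feature_present() ? attr->mode : 0;
}

static const struct attribute_group demo_group = {
	.attrs		= demo_attrs,
	.is_visible	= demo_is_visible,
};
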
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
546 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 601 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
547 } 602 }
548 603
549 if (eax.split.type == CACHE_TYPE_NULL) 604 if (eax.split.type == CTYPE_NULL)
550 return -EIO; /* better error ? */ 605 return -EIO; /* better error ? */
551 606
552 this_leaf->eax = eax; 607 this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
575 /* Do cpuid(op) loop to find out num_cache_leaves */ 630 /* Do cpuid(op) loop to find out num_cache_leaves */
576 cpuid_count(op, i, &eax, &ebx, &ecx, &edx); 631 cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
577 cache_eax.full = eax; 632 cache_eax.full = eax;
578 } while (cache_eax.split.type != CACHE_TYPE_NULL); 633 } while (cache_eax.split.type != CTYPE_NULL);
579 return i; 634 return i;
580} 635}
581 636
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
626 681
627 switch (this_leaf.eax.split.level) { 682 switch (this_leaf.eax.split.level) {
628 case 1: 683 case 1:
629 if (this_leaf.eax.split.type == CACHE_TYPE_DATA) 684 if (this_leaf.eax.split.type == CTYPE_DATA)
630 new_l1d = this_leaf.size/1024; 685 new_l1d = this_leaf.size/1024;
631 else if (this_leaf.eax.split.type == CACHE_TYPE_INST) 686 else if (this_leaf.eax.split.type == CTYPE_INST)
632 new_l1i = this_leaf.size/1024; 687 new_l1i = this_leaf.size/1024;
633 break; 688 break;
634 case 2: 689 case 2:
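
[Editor's note] The this_leaf.size that feeds new_l1d/new_l1i (and the L2/L3 cases below) comes from CPUID leaf 4 geometry, where every field is encoded minus one and the total is the product of the decoded values. A worked example under those architecturally documented semantics:

/*
 * Example: a 32 KB, 8-way L1D with 64-byte lines.
 *
 *   ways       = ebx.ways_of_associativity   + 1 = 8
 *   partitions = ebx.physical_line_partition + 1 = 1
 *   line_size  = ebx.coherency_line_size     + 1 = 64
 *   sets       = ecx.number_of_sets          + 1 = 64
 *
 *   size = ways * partitions * line_size * sets
 *        = 8 * 1 * 64 * 64 = 32768 bytes = 32 KB
 */
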
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
747 return l2; 802 return l2;
748} 803}
749 804
750#ifdef CONFIG_SYSFS 805static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
751 806 struct _cpuid4_info_regs *base)
752/* pointer to _cpuid4_info array (for each cache leaf) */
753static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
754#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
755
756#ifdef CONFIG_SMP
757
758static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
759{ 807{
760 struct _cpuid4_info *this_leaf; 808 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
809 struct cacheinfo *this_leaf;
761 int i, sibling; 810 int i, sibling;
762 811
763 if (cpu_has_topoext) { 812 if (cpu_has_topoext) {
764 unsigned int apicid, nshared, first, last; 813 unsigned int apicid, nshared, first, last;
765 814
766 if (!per_cpu(ici_cpuid4_info, cpu)) 815 this_leaf = this_cpu_ci->info_list + index;
767 return 0; 816 nshared = base->eax.split.num_threads_sharing + 1;
768
769 this_leaf = CPUID4_INFO_IDX(cpu, index);
770 nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
771 apicid = cpu_data(cpu).apicid; 817 apicid = cpu_data(cpu).apicid;
772 first = apicid - (apicid % nshared); 818 first = apicid - (apicid % nshared);
773 last = first + nshared - 1; 819 last = first + nshared - 1;
774 820
775 for_each_online_cpu(i) { 821 for_each_online_cpu(i) {
822 this_cpu_ci = get_cpu_cacheinfo(i);
823 if (!this_cpu_ci->info_list)
824 continue;
825
776 apicid = cpu_data(i).apicid; 826 apicid = cpu_data(i).apicid;
777 if ((apicid < first) || (apicid > last)) 827 if ((apicid < first) || (apicid > last))
778 continue; 828 continue;
779 if (!per_cpu(ici_cpuid4_info, i)) 829
780 continue; 830 this_leaf = this_cpu_ci->info_list + index;
781 this_leaf = CPUID4_INFO_IDX(i, index);
782 831
783 for_each_online_cpu(sibling) { 832 for_each_online_cpu(sibling) {
784 apicid = cpu_data(sibling).apicid; 833 apicid = cpu_data(sibling).apicid;
785 if ((apicid < first) || (apicid > last)) 834 if ((apicid < first) || (apicid > last))
786 continue; 835 continue;
787 set_bit(sibling, this_leaf->shared_cpu_map); 836 cpumask_set_cpu(sibling,
837 &this_leaf->shared_cpu_map);
788 } 838 }
789 } 839 }
790 } else if (index == 3) { 840 } else if (index == 3) {
791 for_each_cpu(i, cpu_llc_shared_mask(cpu)) { 841 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
792 if (!per_cpu(ici_cpuid4_info, i)) 842 this_cpu_ci = get_cpu_cacheinfo(i);
843 if (!this_cpu_ci->info_list)
793 continue; 844 continue;
794 this_leaf = CPUID4_INFO_IDX(i, index); 845 this_leaf = this_cpu_ci->info_list + index;
795 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { 846 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
796 if (!cpu_online(sibling)) 847 if (!cpu_online(sibling))
797 continue; 848 continue;
798 set_bit(sibling, this_leaf->shared_cpu_map); 849 cpumask_set_cpu(sibling,
850 &this_leaf->shared_cpu_map);
799 } 851 }
800 } 852 }
801 } else 853 } else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
804 return 1; 856 return 1;
805} 857}
806 858
807static void cache_shared_cpu_map_setup(unsigned int cpu, int index) 859static void __cache_cpumap_setup(unsigned int cpu, int index,
860 struct _cpuid4_info_regs *base)
808{ 861{
809 struct _cpuid4_info *this_leaf, *sibling_leaf; 862 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
863 struct cacheinfo *this_leaf, *sibling_leaf;
810 unsigned long num_threads_sharing; 864 unsigned long num_threads_sharing;
811 int index_msb, i; 865 int index_msb, i;
812 struct cpuinfo_x86 *c = &cpu_data(cpu); 866 struct cpuinfo_x86 *c = &cpu_data(cpu);
813 867
814 if (c->x86_vendor == X86_VENDOR_AMD) { 868 if (c->x86_vendor == X86_VENDOR_AMD) {
815 if (cache_shared_amd_cpu_map_setup(cpu, index)) 869 if (__cache_amd_cpumap_setup(cpu, index, base))
816 return; 870 return;
817 } 871 }
818 872
819 this_leaf = CPUID4_INFO_IDX(cpu, index); 873 this_leaf = this_cpu_ci->info_list + index;
820 num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; 874 num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
821 875
876 cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
822 if (num_threads_sharing == 1) 877 if (num_threads_sharing == 1)
823 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); 878 return;
824 else {
825 index_msb = get_count_order(num_threads_sharing);
826
827 for_each_online_cpu(i) {
828 if (cpu_data(i).apicid >> index_msb ==
829 c->apicid >> index_msb) {
830 cpumask_set_cpu(i,
831 to_cpumask(this_leaf->shared_cpu_map));
832 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
833 sibling_leaf =
834 CPUID4_INFO_IDX(i, index);
835 cpumask_set_cpu(cpu, to_cpumask(
836 sibling_leaf->shared_cpu_map));
837 }
838 }
839 }
840 }
841}
842static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
843{
844 struct _cpuid4_info *this_leaf, *sibling_leaf;
845 int sibling;
846
847 this_leaf = CPUID4_INFO_IDX(cpu, index);
848 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
849 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
850 cpumask_clear_cpu(cpu,
851 to_cpumask(sibling_leaf->shared_cpu_map));
852 }
853}
854#else
855static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
856{
857}
858
859static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
860{
861}
862#endif
863
864static void free_cache_attributes(unsigned int cpu)
865{
866 int i;
867
868 for (i = 0; i < num_cache_leaves; i++)
869 cache_remove_shared_cpu_map(cpu, i);
870
871 kfree(per_cpu(ici_cpuid4_info, cpu));
872 per_cpu(ici_cpuid4_info, cpu) = NULL;
873}
874
875static void get_cpu_leaves(void *_retval)
876{
877 int j, *retval = _retval, cpu = smp_processor_id();
878 879
879 /* Do cpuid and store the results */ 880 index_msb = get_count_order(num_threads_sharing);
880 for (j = 0; j < num_cache_leaves; j++) {
881 struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
882 881
883 *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); 882 for_each_online_cpu(i)
884 if (unlikely(*retval < 0)) { 883 if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
885 int i; 884 struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
886 885
887 for (i = 0; i < j; i++) 886 if (i == cpu || !sib_cpu_ci->info_list)
888 cache_remove_shared_cpu_map(cpu, i); 887 continue;/* skip if itself or no cacheinfo */
889 break; 888 sibling_leaf = sib_cpu_ci->info_list + index;
889 cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
890 cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
890 } 891 }
891 cache_shared_cpu_map_setup(cpu, j);
892 }
893} 892}
894 893
895static int detect_cache_attributes(unsigned int cpu) 894static void ci_leaf_init(struct cacheinfo *this_leaf,
895 struct _cpuid4_info_regs *base)
896{ 896{
897 int retval; 897 this_leaf->level = base->eax.split.level;
898 898 this_leaf->type = cache_type_map[base->eax.split.type];
899 if (num_cache_leaves == 0) 899 this_leaf->coherency_line_size =
900 return -ENOENT; 900 base->ebx.split.coherency_line_size + 1;
901 901 this_leaf->ways_of_associativity =
902 per_cpu(ici_cpuid4_info, cpu) = kzalloc( 902 base->ebx.split.ways_of_associativity + 1;
903 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 903 this_leaf->size = base->size;
904 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 904 this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
905 return -ENOMEM; 905 this_leaf->physical_line_partition =
906 906 base->ebx.split.physical_line_partition + 1;
907 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 907 this_leaf->priv = base->nb;
908 if (retval) {
909 kfree(per_cpu(ici_cpuid4_info, cpu));
910 per_cpu(ici_cpuid4_info, cpu) = NULL;
911 }
912
913 return retval;
914} 908}
915 909
916#include <linux/kobject.h> 910static int __init_cache_level(unsigned int cpu)
917#include <linux/sysfs.h>
918#include <linux/cpu.h>
919
920/* pointer to kobject for cpuX/cache */
921static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
922
923struct _index_kobject {
924 struct kobject kobj;
925 unsigned int cpu;
926 unsigned short index;
927};
928
929/* pointer to array of kobjects for cpuX/cache/indexY */
930static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
931#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
932
933#define show_one_plus(file_name, object, val) \
934static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
935 unsigned int cpu) \
936{ \
937 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
938}
939
940show_one_plus(level, base.eax.split.level, 0);
941show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
942show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
943show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
944show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
945
946static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
947 unsigned int cpu)
948{
949 return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
950}
951
952static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
953 int type, char *buf)
954{
955 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
956 int ret;
957
958 if (type)
959 ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
960 cpumask_pr_args(mask));
961 else
962 ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
963 cpumask_pr_args(mask));
964 buf[ret++] = '\n';
965 buf[ret] = '\0';
966 return ret;
967}
968
969static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
970 unsigned int cpu)
971{ 911{
972 return show_shared_cpu_map_func(leaf, 0, buf); 912 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
973}
974
975static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
976 unsigned int cpu)
977{
978 return show_shared_cpu_map_func(leaf, 1, buf);
979}
980 913
981static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, 914 if (!num_cache_leaves)
982 unsigned int cpu)
983{
984 switch (this_leaf->base.eax.split.type) {
985 case CACHE_TYPE_DATA:
986 return sprintf(buf, "Data\n");
987 case CACHE_TYPE_INST:
988 return sprintf(buf, "Instruction\n");
989 case CACHE_TYPE_UNIFIED:
990 return sprintf(buf, "Unified\n");
991 default:
992 return sprintf(buf, "Unknown\n");
993 }
994}
995
996#define to_object(k) container_of(k, struct _index_kobject, kobj)
997#define to_attr(a) container_of(a, struct _cache_attr, attr)
998
999#define define_one_ro(_name) \
1000static struct _cache_attr _name = \
1001 __ATTR(_name, 0444, show_##_name, NULL)
1002
1003define_one_ro(level);
1004define_one_ro(type);
1005define_one_ro(coherency_line_size);
1006define_one_ro(physical_line_partition);
1007define_one_ro(ways_of_associativity);
1008define_one_ro(number_of_sets);
1009define_one_ro(size);
1010define_one_ro(shared_cpu_map);
1011define_one_ro(shared_cpu_list);
1012
1013static struct attribute *default_attrs[] = {
1014 &type.attr,
1015 &level.attr,
1016 &coherency_line_size.attr,
1017 &physical_line_partition.attr,
1018 &ways_of_associativity.attr,
1019 &number_of_sets.attr,
1020 &size.attr,
1021 &shared_cpu_map.attr,
1022 &shared_cpu_list.attr,
1023 NULL
1024};
1025
1026#ifdef CONFIG_AMD_NB
1027static struct attribute **amd_l3_attrs(void)
1028{
1029 static struct attribute **attrs;
1030 int n;
1031
1032 if (attrs)
1033 return attrs;
1034
1035 n = ARRAY_SIZE(default_attrs);
1036
1037 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1038 n += 2;
1039
1040 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1041 n += 1;
1042
1043 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
1044 if (attrs == NULL)
1045 return attrs = default_attrs;
1046
1047 for (n = 0; default_attrs[n]; n++)
1048 attrs[n] = default_attrs[n];
1049
1050 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
1051 attrs[n++] = &cache_disable_0.attr;
1052 attrs[n++] = &cache_disable_1.attr;
1053 }
1054
1055 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1056 attrs[n++] = &subcaches.attr;
1057
1058 return attrs;
1059}
1060#endif
1061
1062static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1063{
1064 struct _cache_attr *fattr = to_attr(attr);
1065 struct _index_kobject *this_leaf = to_object(kobj);
1066 ssize_t ret;
1067
1068 ret = fattr->show ?
1069 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1070 buf, this_leaf->cpu) :
1071 0;
1072 return ret;
1073}
1074
1075static ssize_t store(struct kobject *kobj, struct attribute *attr,
1076 const char *buf, size_t count)
1077{
1078 struct _cache_attr *fattr = to_attr(attr);
1079 struct _index_kobject *this_leaf = to_object(kobj);
1080 ssize_t ret;
1081
1082 ret = fattr->store ?
1083 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1084 buf, count, this_leaf->cpu) :
1085 0;
1086 return ret;
1087}
1088
1089static const struct sysfs_ops sysfs_ops = {
1090 .show = show,
1091 .store = store,
1092};
1093
1094static struct kobj_type ktype_cache = {
1095 .sysfs_ops = &sysfs_ops,
1096 .default_attrs = default_attrs,
1097};
1098
1099static struct kobj_type ktype_percpu_entry = {
1100 .sysfs_ops = &sysfs_ops,
1101};
1102
1103static void cpuid4_cache_sysfs_exit(unsigned int cpu)
1104{
1105 kfree(per_cpu(ici_cache_kobject, cpu));
1106 kfree(per_cpu(ici_index_kobject, cpu));
1107 per_cpu(ici_cache_kobject, cpu) = NULL;
1108 per_cpu(ici_index_kobject, cpu) = NULL;
1109 free_cache_attributes(cpu);
1110}
1111
1112static int cpuid4_cache_sysfs_init(unsigned int cpu)
1113{
1114 int err;
1115
1116 if (num_cache_leaves == 0)
1117 return -ENOENT; 915 return -ENOENT;
1118 916 if (!this_cpu_ci)
1119 err = detect_cache_attributes(cpu); 917 return -EINVAL;
1120 if (err) 918 this_cpu_ci->num_levels = 3;
1121 return err; 919 this_cpu_ci->num_leaves = num_cache_leaves;
1122
1123 /* Allocate all required memory */
1124 per_cpu(ici_cache_kobject, cpu) =
1125 kzalloc(sizeof(struct kobject), GFP_KERNEL);
1126 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
1127 goto err_out;
1128
1129 per_cpu(ici_index_kobject, cpu) = kzalloc(
1130 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
1131 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
1132 goto err_out;
1133
1134 return 0; 920 return 0;
1135
1136err_out:
1137 cpuid4_cache_sysfs_exit(cpu);
1138 return -ENOMEM;
1139} 921}
1140 922
1141static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 923static int __populate_cache_leaves(unsigned int cpu)
1142
1143/* Add/Remove cache interface for CPU device */
1144static int cache_add_dev(struct device *dev)
1145{ 924{
1146 unsigned int cpu = dev->id; 925 unsigned int idx, ret;
1147 unsigned long i, j; 926 struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
1148 struct _index_kobject *this_object; 927 struct cacheinfo *this_leaf = this_cpu_ci->info_list;
1149 struct _cpuid4_info *this_leaf; 928 struct _cpuid4_info_regs id4_regs = {};
1150 int retval;
1151
1152 retval = cpuid4_cache_sysfs_init(cpu);
1153 if (unlikely(retval < 0))
1154 return retval;
1155
1156 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1157 &ktype_percpu_entry,
1158 &dev->kobj, "%s", "cache");
1159 if (retval < 0) {
1160 cpuid4_cache_sysfs_exit(cpu);
1161 return retval;
1162 }
1163 929
1164 for (i = 0; i < num_cache_leaves; i++) { 930 for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
1165 this_object = INDEX_KOBJECT_PTR(cpu, i); 931 ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
1166 this_object->cpu = cpu; 932 if (ret)
1167 this_object->index = i; 933 return ret;
1168 934 ci_leaf_init(this_leaf++, &id4_regs);
1169 this_leaf = CPUID4_INFO_IDX(cpu, i); 935 __cache_cpumap_setup(cpu, idx, &id4_regs);
1170
1171 ktype_cache.default_attrs = default_attrs;
1172#ifdef CONFIG_AMD_NB
1173 if (this_leaf->base.nb)
1174 ktype_cache.default_attrs = amd_l3_attrs();
1175#endif
1176 retval = kobject_init_and_add(&(this_object->kobj),
1177 &ktype_cache,
1178 per_cpu(ici_cache_kobject, cpu),
1179 "index%1lu", i);
1180 if (unlikely(retval)) {
1181 for (j = 0; j < i; j++)
1182 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
1183 kobject_put(per_cpu(ici_cache_kobject, cpu));
1184 cpuid4_cache_sysfs_exit(cpu);
1185 return retval;
1186 }
1187 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
1188 } 936 }
1189 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
1190
1191 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
1192 return 0; 937 return 0;
1193} 938}
1194 939
1195static void cache_remove_dev(struct device *dev) 940DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
1196{ 941DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
1197 unsigned int cpu = dev->id;
1198 unsigned long i;
1199
1200 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
1201 return;
1202 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
1203 return;
1204 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
1205
1206 for (i = 0; i < num_cache_leaves; i++)
1207 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
1208 kobject_put(per_cpu(ici_cache_kobject, cpu));
1209 cpuid4_cache_sysfs_exit(cpu);
1210}
1211
1212static int cacheinfo_cpu_callback(struct notifier_block *nfb,
1213 unsigned long action, void *hcpu)
1214{
1215 unsigned int cpu = (unsigned long)hcpu;
1216 struct device *dev;
1217
1218 dev = get_cpu_device(cpu);
1219 switch (action) {
1220 case CPU_ONLINE:
1221 case CPU_ONLINE_FROZEN:
1222 cache_add_dev(dev);
1223 break;
1224 case CPU_DEAD:
1225 case CPU_DEAD_FROZEN:
1226 cache_remove_dev(dev);
1227 break;
1228 }
1229 return NOTIFY_OK;
1230}
1231
1232static struct notifier_block cacheinfo_cpu_notifier = {
1233 .notifier_call = cacheinfo_cpu_callback,
1234};
1235
1236static int __init cache_sysfs_init(void)
1237{
1238 int i, err = 0;
1239
1240 if (num_cache_leaves == 0)
1241 return 0;
1242
1243 cpu_notifier_register_begin();
1244 for_each_online_cpu(i) {
1245 struct device *dev = get_cpu_device(i);
1246
1247 err = cache_add_dev(dev);
1248 if (err)
1249 goto out;
1250 }
1251 __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
1252
1253out:
1254 cpu_notifier_register_done();
1255 return err;
1256}
1257
1258device_initcall(cache_sysfs_init);
1259
1260#endif
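
[Editor's note] __cache_cpumap_setup() above decides which CPUs share a leaf by comparing APIC IDs shifted right by index_msb = get_count_order(num_threads_sharing). A quick numeric illustration of why that works; the APIC ID values are invented for the example:

/*
 * Suppose CPUID reports num_threads_sharing = 2 for an L1 leaf
 * (the two SMT siblings of a core share it):
 *
 *   index_msb = get_count_order(2) = 1
 *
 *   CPU0: apicid = 0b0000  ->  0b0000 >> 1 = 0
 *   CPU1: apicid = 0b0001  ->  0b0001 >> 1 = 0   (same core: shared)
 *   CPU2: apicid = 0b0010  ->  0b0010 >> 1 = 1   (next core: not shared)
 *
 * For an L3 shared by 8 threads, index_msb = get_count_order(8) = 3,
 * so two CPUs match once the low 3 APIC-ID bits are ignored.
 */
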
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
14}; 14};
15 15
16#define ATTR_LEN 16 16#define ATTR_LEN 16
17#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
17 18
18/* One object for each MCE bank, shared by all CPUs */ 19/* One object for each MCE bank, shared by all CPUs */
19struct mce_bank { 20struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
23 char attrname[ATTR_LEN]; /* attribute name */ 24 char attrname[ATTR_LEN]; /* attribute name */
24}; 25};
25 26
26int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); 27extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
27struct dentry *mce_get_debugfs_dir(void); 28struct dentry *mce_get_debugfs_dir(void);
28 29
29extern struct mce_bank *mce_banks; 30extern struct mce_bank *mce_banks;
30extern mce_banks_t mce_banks_ce_disabled; 31extern mce_banks_t mce_banks_ce_disabled;
31 32
32#ifdef CONFIG_X86_MCE_INTEL 33#ifdef CONFIG_X86_MCE_INTEL
33unsigned long mce_intel_adjust_timer(unsigned long interval); 34unsigned long cmci_intel_adjust_timer(unsigned long interval);
34void mce_intel_cmci_poll(void); 35bool mce_intel_cmci_poll(void);
35void mce_intel_hcpu_update(unsigned long cpu); 36void mce_intel_hcpu_update(unsigned long cpu);
36void cmci_disable_bank(int bank); 37void cmci_disable_bank(int bank);
37#else 38#else
38# define mce_intel_adjust_timer mce_adjust_timer_default 39# define cmci_intel_adjust_timer mce_adjust_timer_default
39static inline void mce_intel_cmci_poll(void) { } 40static inline bool mce_intel_cmci_poll(void) { return false; }
40static inline void mce_intel_hcpu_update(unsigned long cpu) { } 41static inline void mce_intel_hcpu_update(unsigned long cpu) { }
41static inline void cmci_disable_bank(int bank) { } 42static inline void cmci_disable_bank(int bank) { }
42#endif 43#endif
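
[Editor's note] Two interface changes in this header are worth calling out: mce_severity becomes a function pointer that vendor init code can retarget, and mce_intel_cmci_poll() now reports via bool whether it actually polled, with the !CONFIG_X86_MCE_INTEL stub simply returning false so callers need no #ifdefs. A generic sketch of that stub idiom, with a hypothetical CONFIG_FOO/foo_poll() rather than the real symbols:

/* foo.h -- callers never carry their own #ifdef */
#ifdef CONFIG_FOO
bool foo_poll(void);				/* real implementation in foo.c */
#else
static inline bool foo_poll(void) { return false; }
#endif

/* caller: the branch folds away entirely when CONFIG_FOO=n */
static void demo_timer_tick(void)
{
	if (foo_poll())
		return;
	/* ... default path ... */
}
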
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
187} 187}
188 188
189int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) 189/*
190 * See AMD Error Scope Hierarchy table in a newer BKDG. For example
191 * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
192 */
193static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
194{
195 enum context ctx = error_context(m);
196
197 /* Processor Context Corrupt, no need to fumble too much, die! */
198 if (m->status & MCI_STATUS_PCC)
199 return MCE_PANIC_SEVERITY;
200
201 if (m->status & MCI_STATUS_UC) {
202
203 /*
204 * On older systems where overflow_recov flag is not present, we
205 * should simply panic if an error overflow occurs. If
206 * overflow_recov flag is present and set, then software can try
207 * to at least kill process to prolong system operation.
208 */
209 if (mce_flags.overflow_recov) {
210 /* software can try to contain */
211 if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
212 return MCE_PANIC_SEVERITY;
213
214 /* kill current process */
215 return MCE_AR_SEVERITY;
216 } else {
217 /* at least one error was not logged */
218 if (m->status & MCI_STATUS_OVER)
219 return MCE_PANIC_SEVERITY;
220 }
221
222 /*
223 * For any other case, return MCE_UC_SEVERITY so that we log the
224 * error and exit #MC handler.
225 */
226 return MCE_UC_SEVERITY;
227 }
228
229 /*
230 * deferred error: poll handler catches these and adds to mce_ring so
231 * memory-failure can take recovery actions.
232 */
233 if (m->status & MCI_STATUS_DEFERRED)
234 return MCE_DEFERRED_SEVERITY;
235
236 /*
237 * corrected error: poll handler catches these and passes responsibility
238 * of decoding the error to EDAC
239 */
240 return MCE_KEEP_SEVERITY;
241}
242
243static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
190{ 244{
191 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); 245 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
192 enum context ctx = error_context(m); 246 enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
216 } 270 }
217} 271}
218 272
273/* Default to mce_severity_intel */
274int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
275 mce_severity_intel;
276
277void __init mcheck_vendor_init_severity(void)
278{
279 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
280 mce_severity = mce_severity_amd;
281}
282
219#ifdef CONFIG_DEBUG_FS 283#ifdef CONFIG_DEBUG_FS
220static void *s_start(struct seq_file *f, loff_t *pos) 284static void *s_start(struct seq_file *f, loff_t *pos)
221{ 285{
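
[Editor's note] The AMD grading above feeds the same consumers as the Intel routine: callers branch on the returned grade, not on which vendor function produced it. The following is a deliberately simplified sketch of such a caller, not the kernel's actual #MC handler; kill_affected_process() is a hypothetical helper standing in for the real memory-failure/signal path:

static void handle_one_mce_sketch(struct mce *m, int tolerant)
{
	int sev = mce_severity(m, tolerant, NULL, true);

	if (sev == MCE_PANIC_SEVERITY)
		panic("machine check");
	else if (sev == MCE_AR_SEVERITY)
		kill_affected_process(m);	/* hypothetical helper */
	else if (sev == MCE_KEEP_SEVERITY || sev == MCE_DEFERRED_SEVERITY)
		;	/* leave it to the poll handler / EDAC, per the comments above */
	else
		mce_log(m);
}
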
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb4a370..e535533d5ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
60#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
61#include <trace/events/mce.h> 61#include <trace/events/mce.h>
62 62
63#define SPINUNIT 100 /* 100ns */ 63#define SPINUNIT 100 /* 100ns */
64 64
65DEFINE_PER_CPU(unsigned, mce_exception_count); 65DEFINE_PER_CPU(unsigned, mce_exception_count);
66 66
67struct mce_bank *mce_banks __read_mostly; 67struct mce_bank *mce_banks __read_mostly;
68struct mce_vendor_flags mce_flags __read_mostly;
68 69
69struct mca_config mca_cfg __read_mostly = { 70struct mca_config mca_cfg __read_mostly = {
70 .bootlog = -1, 71 .bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
89static DEFINE_PER_CPU(struct mce, mces_seen); 90static DEFINE_PER_CPU(struct mce, mces_seen);
90static int cpu_missing; 91static int cpu_missing;
91 92
92/* CMCI storm detection filter */
93static DEFINE_PER_CPU(unsigned long, mce_polled_error);
94
95/* 93/*
96 * MCA banks polled by the period polling timer for corrected events. 94 * MCA banks polled by the period polling timer for corrected events.
97 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). 95 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
622 * is already totally * confused. In this case it's likely it will 620 * is already totally * confused. In this case it's likely it will
623 * not fully execute the machine check handler either. 621 * not fully execute the machine check handler either.
624 */ 622 */
625void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 623bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
626{ 624{
625 bool error_logged = false;
627 struct mce m; 626 struct mce m;
628 int severity; 627 int severity;
629 int i; 628 int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
646 if (!(m.status & MCI_STATUS_VAL)) 645 if (!(m.status & MCI_STATUS_VAL))
647 continue; 646 continue;
648 647
649 this_cpu_write(mce_polled_error, 1); 648
650 /* 649 /*
651 * Uncorrected or signalled events are handled by the exception 650 * Uncorrected or signalled events are handled by the exception
652 * handler when it is enabled, so don't process those here. 651 * handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
679 * Don't get the IP here because it's unlikely to 678 * Don't get the IP here because it's unlikely to
680 * have anything to do with the actual error location. 679 * have anything to do with the actual error location.
681 */ 680 */
682 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) 681 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
682 error_logged = true;
683 mce_log(&m); 683 mce_log(&m);
684 }
684 685
685 /* 686 /*
686 * Clear state for this bank. 687 * Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
694 */ 695 */
695 696
696 sync_core(); 697 sync_core();
698
699 return error_logged;
697} 700}
698EXPORT_SYMBOL_GPL(machine_check_poll); 701EXPORT_SYMBOL_GPL(machine_check_poll);
699 702
@@ -813,7 +816,7 @@ static void mce_reign(void)
813 * other CPUs. 816 * other CPUs.
814 */ 817 */
815 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) 818 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
816 mce_panic("Fatal Machine check", m, msg); 819 mce_panic("Fatal machine check", m, msg);
817 820
818 /* 821 /*
819 * For UC somewhere we let the CPU who detects it handle it. 822 * For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
826 * source or one CPU is hung. Panic. 829 * source or one CPU is hung. Panic.
827 */ 830 */
828 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) 831 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
829 mce_panic("Machine check from unknown source", NULL, NULL); 832 mce_panic("Fatal machine check from unknown source", NULL, NULL);
830 833
831 /* 834 /*
832 * Now clear all the mces_seen so that they don't reappear on 835 * Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
1258 * poller finds an MCE, poll 2x faster. When the poller finds no more 1261 * poller finds an MCE, poll 2x faster. When the poller finds no more
1259 * errors, poll 2x slower (up to check_interval seconds). 1262 * errors, poll 2x slower (up to check_interval seconds).
1260 */ 1263 */
1261static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1264static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1262 1265
1263static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1266static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1264static DEFINE_PER_CPU(struct timer_list, mce_timer); 1267static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
1268 return interval; 1271 return interval;
1269} 1272}
1270 1273
1271static unsigned long (*mce_adjust_timer)(unsigned long interval) = 1274static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1272 mce_adjust_timer_default;
1273 1275
1274static int cmc_error_seen(void) 1276static void __restart_timer(struct timer_list *t, unsigned long interval)
1275{ 1277{
1276 unsigned long *v = this_cpu_ptr(&mce_polled_error); 1278 unsigned long when = jiffies + interval;
1279 unsigned long flags;
1280
1281 local_irq_save(flags);
1277 1282
1278 return test_and_clear_bit(0, v); 1283 if (timer_pending(t)) {
1284 if (time_before(when, t->expires))
1285 mod_timer_pinned(t, when);
1286 } else {
1287 t->expires = round_jiffies(when);
1288 add_timer_on(t, smp_processor_id());
1289 }
1290
1291 local_irq_restore(flags);
1279} 1292}
1280 1293
1281static void mce_timer_fn(unsigned long data) 1294static void mce_timer_fn(unsigned long data)
1282{ 1295{
1283 struct timer_list *t = this_cpu_ptr(&mce_timer); 1296 struct timer_list *t = this_cpu_ptr(&mce_timer);
1297 int cpu = smp_processor_id();
1284 unsigned long iv; 1298 unsigned long iv;
1285 int notify;
1286 1299
1287 WARN_ON(smp_processor_id() != data); 1300 WARN_ON(cpu != data);
1301
1302 iv = __this_cpu_read(mce_next_interval);
1288 1303
1289 if (mce_available(this_cpu_ptr(&cpu_info))) { 1304 if (mce_available(this_cpu_ptr(&cpu_info))) {
1290 machine_check_poll(MCP_TIMESTAMP, 1305 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1291 this_cpu_ptr(&mce_poll_banks)); 1306
1292 mce_intel_cmci_poll(); 1307 if (mce_intel_cmci_poll()) {
1308 iv = mce_adjust_timer(iv);
1309 goto done;
1310 }
1293 } 1311 }
1294 1312
1295 /* 1313 /*
1296 * Alert userspace if needed. If we logged an MCE, reduce the 1314 * Alert userspace if needed. If we logged an MCE, reduce the polling
1297 * polling interval, otherwise increase the polling interval. 1315 * interval, otherwise increase the polling interval.
1298 */ 1316 */
1299 iv = __this_cpu_read(mce_next_interval); 1317 if (mce_notify_irq())
1300 notify = mce_notify_irq();
1301 notify |= cmc_error_seen();
1302 if (notify) {
1303 iv = max(iv / 2, (unsigned long) HZ/100); 1318 iv = max(iv / 2, (unsigned long) HZ/100);
1304 } else { 1319 else
1305 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1320 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1306 iv = mce_adjust_timer(iv); 1321
1307 } 1322done:
1308 __this_cpu_write(mce_next_interval, iv); 1323 __this_cpu_write(mce_next_interval, iv);
1309 /* Might have become 0 after CMCI storm subsided */ 1324 __restart_timer(t, iv);
1310 if (iv) {
1311 t->expires = jiffies + iv;
1312 add_timer_on(t, smp_processor_id());
1313 }
1314} 1325}
1315 1326
1316/* 1327/*
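
[Editor's note] With the restructuring above, the polling cadence is a plain multiplicative backoff clamped between HZ/100 and check_interval * HZ: halve the interval when the poll saw something, double it otherwise (with HZ=1000 and the default 5-minute check_interval, that is a range of 10 ms to 300 s). A compact sketch of just that adjustment, with storm handling and __restart_timer() left out:

/* Sketch: the interval update in isolation. */
static unsigned long next_poll_interval(unsigned long iv, bool saw_activity)
{
	if (saw_activity)
		iv = max(iv / 2, (unsigned long)HZ / 100);
	else
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

	return iv;	/* e.g. quiet ticks: 10ms -> 20ms -> 40ms -> ... -> 300s */
}
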
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
1319void mce_timer_kick(unsigned long interval) 1330void mce_timer_kick(unsigned long interval)
1320{ 1331{
1321 struct timer_list *t = this_cpu_ptr(&mce_timer); 1332 struct timer_list *t = this_cpu_ptr(&mce_timer);
1322 unsigned long when = jiffies + interval;
1323 unsigned long iv = __this_cpu_read(mce_next_interval); 1333 unsigned long iv = __this_cpu_read(mce_next_interval);
1324 1334
1325 if (timer_pending(t)) { 1335 __restart_timer(t, interval);
1326 if (time_before(when, t->expires)) 1336
1327 mod_timer_pinned(t, when);
1328 } else {
1329 t->expires = round_jiffies(when);
1330 add_timer_on(t, smp_processor_id());
1331 }
1332 if (interval < iv) 1337 if (interval < iv)
1333 __this_cpu_write(mce_next_interval, interval); 1338 __this_cpu_write(mce_next_interval, interval);
1334} 1339}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1525 * Various K7s with broken bank 0 around. Always disable 1530 * Various K7s with broken bank 0 around. Always disable
1526 * by default. 1531 * by default.
1527 */ 1532 */
1528 if (c->x86 == 6 && cfg->banks > 0) 1533 if (c->x86 == 6 && cfg->banks > 0)
1529 mce_banks[0].ctl = 0; 1534 mce_banks[0].ctl = 0;
1530 1535
1531 /* 1536 /*
1532 * Turn off MC4_MISC thresholding banks on those models since 1537 * overflow_recov is supported for F15h Models 00h-0fh
1533 * they're not supported there. 1538 * even though we don't have a CPUID bit for it.
1534 */ 1539 */
1535 if (c->x86 == 0x15 && 1540 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1536 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { 1541 mce_flags.overflow_recov = 1;
1537 int i; 1542
1538 u64 val, hwcr; 1543 /*
1539 bool need_toggle; 1544 * Turn off MC4_MISC thresholding banks on those models since
1540 u32 msrs[] = { 1545 * they're not supported there.
1546 */
1547 if (c->x86 == 0x15 &&
1548 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1549 int i;
1550 u64 hwcr;
1551 bool need_toggle;
1552 u32 msrs[] = {
1541 0x00000413, /* MC4_MISC0 */ 1553 0x00000413, /* MC4_MISC0 */
1542 0xc0000408, /* MC4_MISC1 */ 1554 0xc0000408, /* MC4_MISC1 */
1543 }; 1555 };
1544 1556
1545 rdmsrl(MSR_K7_HWCR, hwcr); 1557 rdmsrl(MSR_K7_HWCR, hwcr);
1546 1558
1547 /* McStatusWrEn has to be set */ 1559 /* McStatusWrEn has to be set */
1548 need_toggle = !(hwcr & BIT(18)); 1560 need_toggle = !(hwcr & BIT(18));
1549 1561
1550 if (need_toggle) 1562 if (need_toggle)
1551 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 1563 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1552 1564
1553 for (i = 0; i < ARRAY_SIZE(msrs); i++) { 1565 /* Clear CntP bit safely */
1554 rdmsrl(msrs[i], val); 1566 for (i = 0; i < ARRAY_SIZE(msrs); i++)
1567 msr_clear_bit(msrs[i], 62);
1555 1568
1556 /* CntP bit set? */ 1569 /* restore old settings */
1557 if (val & BIT_64(62)) { 1570 if (need_toggle)
1558 val &= ~BIT_64(62); 1571 wrmsrl(MSR_K7_HWCR, hwcr);
1559 wrmsrl(msrs[i], val); 1572 }
1560 }
1561 }
1562
1563 /* restore old settings */
1564 if (need_toggle)
1565 wrmsrl(MSR_K7_HWCR, hwcr);
1566 }
1567 } 1573 }
1568 1574
1569 if (c->x86_vendor == X86_VENDOR_INTEL) { 1575 if (c->x86_vendor == X86_VENDOR_INTEL) {
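
[Editor's note] The open-coded rdmsrl()/test/wrmsrl() sequence for MC4_MISC is replaced by msr_clear_bit(), which likewise only writes the MSR back when the bit was actually set. For comparison, this is roughly the logic the helper absorbs (essentially the code removed above):

/* What msr_clear_bit(msr, 62) replaces, roughly: */
static void clear_cntp_bit_by_hand(u32 msr)
{
	u64 val;

	rdmsrl(msr, val);
	if (val & BIT_64(62)) {		/* only touch the MSR if needed */
		val &= ~BIT_64(62);
		wrmsrl(msr, val);
	}
}
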
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1629 switch (c->x86_vendor) { 1635 switch (c->x86_vendor) {
1630 case X86_VENDOR_INTEL: 1636 case X86_VENDOR_INTEL:
1631 mce_intel_feature_init(c); 1637 mce_intel_feature_init(c);
1632 mce_adjust_timer = mce_intel_adjust_timer; 1638 mce_adjust_timer = cmci_intel_adjust_timer;
1633 break; 1639 break;
1634 case X86_VENDOR_AMD: 1640 case X86_VENDOR_AMD:
1635 mce_amd_feature_init(c); 1641 mce_amd_feature_init(c);
1642 mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
1636 break; 1643 break;
1637 default: 1644 default:
1638 break; 1645 break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
2017int __init mcheck_init(void) 2024int __init mcheck_init(void)
2018{ 2025{
2019 mcheck_intel_therm_init(); 2026 mcheck_intel_therm_init();
2027 mcheck_vendor_init_severity();
2020 2028
2021 return 0; 2029 return 0;
2022} 2030}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
79 return (bank == 4); 79 return (bank == 4);
80} 80}
81 81
82static const char * const bank4_names(struct threshold_block *b) 82static const char *bank4_names(const struct threshold_block *b)
83{ 83{
84 switch (b->address) { 84 switch (b->address) {
85 /* MSR4_MISC0 */ 85 /* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
250 if (!b.interrupt_capable) 250 if (!b.interrupt_capable)
251 goto init; 251 goto init;
252 252
253 b.interrupt_enable = 1;
253 new = (high & MASK_LVTOFF_HI) >> 20; 254 new = (high & MASK_LVTOFF_HI) >> 20;
254 offset = setup_APIC_mce(offset, new); 255 offset = setup_APIC_mce(offset, new);
255 256
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
322log: 323log:
323 mce_setup(&m); 324 mce_setup(&m);
324 rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); 325 rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
326 if (!(m.status & MCI_STATUS_VAL))
327 return;
325 m.misc = ((u64)high << 32) | low; 328 m.misc = ((u64)high << 32) | low;
326 m.bank = bank; 329 m.bank = bank;
327 mce_log(&m); 330 mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
497 b->interrupt_capable = lvt_interrupt_supported(bank, high); 500 b->interrupt_capable = lvt_interrupt_supported(bank, high);
498 b->threshold_limit = THRESHOLD_MAX; 501 b->threshold_limit = THRESHOLD_MAX;
499 502
500 if (b->interrupt_capable) 503 if (b->interrupt_capable) {
501 threshold_ktype.default_attrs[2] = &interrupt_enable.attr; 504 threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
502 else 505 b->interrupt_enable = 1;
506 } else {
503 threshold_ktype.default_attrs[2] = NULL; 507 threshold_ktype.default_attrs[2] = NULL;
508 }
504 509
505 INIT_LIST_HEAD(&b->miscj); 510 INIT_LIST_HEAD(&b->miscj);
506 511
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); 39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
40 40
41/* 41/*
42 * CMCI storm detection backoff counter
43 *
44 * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
45 * encountered an error. If not, we decrement it by one. We signal the end of
46 * the CMCI storm when it reaches 0.
47 */
48static DEFINE_PER_CPU(int, cmci_backoff_cnt);
49
50/*
42 * cmci_discover_lock protects against parallel discovery attempts 51 * cmci_discover_lock protects against parallel discovery attempts
43 * which could race against each other. 52 * which could race against each other.
44 */ 53 */
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
46 55
47#define CMCI_THRESHOLD 1 56#define CMCI_THRESHOLD 1
48#define CMCI_POLL_INTERVAL (30 * HZ) 57#define CMCI_POLL_INTERVAL (30 * HZ)
49#define CMCI_STORM_INTERVAL (1 * HZ) 58#define CMCI_STORM_INTERVAL (HZ)
50#define CMCI_STORM_THRESHOLD 15 59#define CMCI_STORM_THRESHOLD 15
51 60
52static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); 61static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
82 return !!(cap & MCG_CMCI_P); 91 return !!(cap & MCG_CMCI_P);
83} 92}
84 93
85void mce_intel_cmci_poll(void) 94bool mce_intel_cmci_poll(void)
86{ 95{
87 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) 96 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
88 return; 97 return false;
89 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 98
99 /*
100 * Reset the counter if we've logged an error in the last poll
101 * during the storm.
102 */
103 if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
104 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
105 else
106 this_cpu_dec(cmci_backoff_cnt);
107
108 return true;
90} 109}
91 110
92void mce_intel_hcpu_update(unsigned long cpu) 111void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
97 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; 116 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
98} 117}
99 118
100unsigned long mce_intel_adjust_timer(unsigned long interval) 119unsigned long cmci_intel_adjust_timer(unsigned long interval)
101{ 120{
102 int r; 121 if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
103 122 (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
104 if (interval < CMCI_POLL_INTERVAL) 123 mce_notify_irq();
105 return interval; 124 return CMCI_STORM_INTERVAL;
125 }
106 126
107 switch (__this_cpu_read(cmci_storm_state)) { 127 switch (__this_cpu_read(cmci_storm_state)) {
108 case CMCI_STORM_ACTIVE: 128 case CMCI_STORM_ACTIVE:
129
109 /* 130 /*
110 * We switch back to interrupt mode once the poll timer has 131 * We switch back to interrupt mode once the poll timer has
111 * silenced itself. That means no events recorded and the 132 * silenced itself. That means no events recorded and the timer
112 * timer interval is back to our poll interval. 133 * interval is back to our poll interval.
113 */ 134 */
114 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); 135 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
115 r = atomic_sub_return(1, &cmci_storm_on_cpus); 136 if (!atomic_sub_return(1, &cmci_storm_on_cpus))
116 if (r == 0)
117 pr_notice("CMCI storm subsided: switching to interrupt mode\n"); 137 pr_notice("CMCI storm subsided: switching to interrupt mode\n");
138
118 /* FALLTHROUGH */ 139 /* FALLTHROUGH */
119 140
120 case CMCI_STORM_SUBSIDED: 141 case CMCI_STORM_SUBSIDED:
121 /* 142 /*
122 * We wait for all cpus to go back to SUBSIDED 143 * We wait for all CPUs to go back to SUBSIDED state. When that
123 * state. When that happens we switch back to 144 * happens we switch back to interrupt mode.
124 * interrupt mode.
125 */ 145 */
126 if (!atomic_read(&cmci_storm_on_cpus)) { 146 if (!atomic_read(&cmci_storm_on_cpus)) {
127 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); 147 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
130 } 150 }
131 return CMCI_POLL_INTERVAL; 151 return CMCI_POLL_INTERVAL;
132 default: 152 default:
133 /* 153
134 * We have shiny weather. Let the poll do whatever it 154 /* We have shiny weather. Let the poll do whatever it thinks. */
135 * thinks.
136 */
137 return interval; 155 return interval;
138 } 156 }
139} 157}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
178 cmci_storm_disable_banks(); 196 cmci_storm_disable_banks();
179 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); 197 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
180 r = atomic_add_return(1, &cmci_storm_on_cpus); 198 r = atomic_add_return(1, &cmci_storm_on_cpus);
181 mce_timer_kick(CMCI_POLL_INTERVAL); 199 mce_timer_kick(CMCI_STORM_INTERVAL);
200 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
182 201
183 if (r == 1) 202 if (r == 1)
184 pr_notice("CMCI storm detected: switching to poll mode\n"); 203 pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
195{ 214{
196 if (cmci_storm_detect()) 215 if (cmci_storm_detect())
197 return; 216 return;
217
198 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 218 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
199 mce_notify_irq(); 219 mce_notify_irq();
200} 220}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
286 306
287 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) 307 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
288 return; 308 return;
309
289 local_irq_save(flags); 310 local_irq_save(flags);
290 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 311 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
291 local_irq_restore(flags); 312 local_irq_restore(flags);
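
Illustrative sketch (not from the patch above): the mce_intel.c hunks make mce_intel_cmci_poll() report whether the poll logged anything and add a per-CPU cmci_backoff_cnt, so cmci_intel_adjust_timer() keeps the fast CMCI_STORM_INTERVAL rate only while that counter stays positive. The stand-alone C reduction below compresses the interplay to a single CPU with plain globals; the numeric values and the poll stub are made up for the demo, and the mce_notify_irq() kick plus the multi-CPU storm accounting are left out.

#include <stdbool.h>
#include <stdio.h>

#define INITIAL_CHECK_INTERVAL	3	/* assumed: a handful of storm polls */
#define CMCI_STORM_INTERVAL	1	/* stands in for HZ */
#define CMCI_POLL_INTERVAL	30	/* stands in for 30 * HZ */

enum storm_state { CMCI_STORM_NONE, CMCI_STORM_ACTIVE, CMCI_STORM_SUBSIDED };

static enum storm_state cmci_storm_state = CMCI_STORM_ACTIVE;
static int cmci_backoff_cnt = INITIAL_CHECK_INTERVAL;

/* Stand-in for machine_check_poll(): true if an error was logged this time. */
static bool poll_found_error(void)
{
	return false;
}

/* Same shape as the reworked mce_intel_cmci_poll(). */
static bool cmci_poll(void)
{
	if (cmci_storm_state == CMCI_STORM_NONE)
		return false;

	if (poll_found_error())
		cmci_backoff_cnt = INITIAL_CHECK_INTERVAL;	/* error seen: keep storming */
	else
		cmci_backoff_cnt--;				/* quiet poll: count down */

	return true;
}

/* Single-CPU shape of cmci_intel_adjust_timer(). */
static unsigned long adjust_timer(unsigned long interval)
{
	if (cmci_backoff_cnt > 0 && cmci_storm_state == CMCI_STORM_ACTIVE)
		return CMCI_STORM_INTERVAL;	/* still backing off: poll fast */

	switch (cmci_storm_state) {
	case CMCI_STORM_ACTIVE:
		cmci_storm_state = CMCI_STORM_SUBSIDED;
		/* fall through */
	case CMCI_STORM_SUBSIDED:
		cmci_storm_state = CMCI_STORM_NONE;	/* no other CPUs to wait for here */
		return CMCI_POLL_INTERVAL;
	default:
		return interval;
	}
}

int main(void)
{
	unsigned long t = CMCI_POLL_INTERVAL;
	int polls = 0;

	while (polls < 16 && cmci_poll()) {
		t = adjust_timer(t);
		polls++;
	}
	printf("storm over after %d quiet polls, timer back to %lu\n", polls, t);
	return 0;
}
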
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 22
23#include <linux/firmware.h> 23#include <linux/firmware.h>
24#include <linux/pci_ids.h>
25#include <linux/uaccess.h> 24#include <linux/uaccess.h>
26#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
23#include <asm/processor.h> 23#include <asm/processor.h>
24#include <asm/cmdline.h> 24#include <asm/cmdline.h>
25 25
26#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
27#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
28#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
29#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
30#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
31#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
32#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
33
34#define CPUID_IS(a, b, c, ebx, ecx, edx) \
35 (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
36
37/*
38 * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
39 * x86_vendor() gets vendor id for BSP.
40 *
41 * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
42 * coding, we still use x86_vendor() to get vendor id for AP.
43 *
44 * x86_vendor() gets vendor information directly through cpuid.
45 */
46static int x86_vendor(void)
47{
48 u32 eax = 0x00000000;
49 u32 ebx, ecx = 0, edx;
50
51 native_cpuid(&eax, &ebx, &ecx, &edx);
52
53 if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
54 return X86_VENDOR_INTEL;
55
56 if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
57 return X86_VENDOR_AMD;
58
59 return X86_VENDOR_UNKNOWN;
60}
61
62static int x86_family(void)
63{
64 u32 eax = 0x00000001;
65 u32 ebx, ecx = 0, edx;
66 int x86;
67
68 native_cpuid(&eax, &ebx, &ecx, &edx);
69
70 x86 = (eax >> 8) & 0xf;
71 if (x86 == 15)
72 x86 += (eax >> 20) & 0xff;
73
74 return x86;
75}
76
77static bool __init check_loader_disabled_bsp(void) 26static bool __init check_loader_disabled_bsp(void)
78{ 27{
79#ifdef CONFIG_X86_32 28#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
96 45
97void __init load_ucode_bsp(void) 46void __init load_ucode_bsp(void)
98{ 47{
99 int vendor, x86; 48 int vendor, family;
100 49
101 if (check_loader_disabled_bsp()) 50 if (check_loader_disabled_bsp())
102 return; 51 return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
105 return; 54 return;
106 55
107 vendor = x86_vendor(); 56 vendor = x86_vendor();
108 x86 = x86_family(); 57 family = x86_family();
109 58
110 switch (vendor) { 59 switch (vendor) {
111 case X86_VENDOR_INTEL: 60 case X86_VENDOR_INTEL:
112 if (x86 >= 6) 61 if (family >= 6)
113 load_ucode_intel_bsp(); 62 load_ucode_intel_bsp();
114 break; 63 break;
115 case X86_VENDOR_AMD: 64 case X86_VENDOR_AMD:
116 if (x86 >= 0x10) 65 if (family >= 0x10)
117 load_ucode_amd_bsp(); 66 load_ucode_amd_bsp();
118 break; 67 break;
119 default: 68 default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
132 81
133void load_ucode_ap(void) 82void load_ucode_ap(void)
134{ 83{
135 int vendor, x86; 84 int vendor, family;
136 85
137 if (check_loader_disabled_ap()) 86 if (check_loader_disabled_ap())
138 return; 87 return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
141 return; 90 return;
142 91
143 vendor = x86_vendor(); 92 vendor = x86_vendor();
144 x86 = x86_family(); 93 family = x86_family();
145 94
146 switch (vendor) { 95 switch (vendor) {
147 case X86_VENDOR_INTEL: 96 case X86_VENDOR_INTEL:
148 if (x86 >= 6) 97 if (family >= 6)
149 load_ucode_intel_ap(); 98 load_ucode_intel_ap();
150 break; 99 break;
151 case X86_VENDOR_AMD: 100 case X86_VENDOR_AMD:
152 if (x86 >= 0x10) 101 if (family >= 0x10)
153 load_ucode_amd_ap(); 102 load_ucode_amd_ap();
154 break; 103 break;
155 default: 104 default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
179 128
180void reload_early_microcode(void) 129void reload_early_microcode(void)
181{ 130{
182 int vendor, x86; 131 int vendor, family;
183 132
184 vendor = x86_vendor(); 133 vendor = x86_vendor();
185 x86 = x86_family(); 134 family = x86_family();
186 135
187 switch (vendor) { 136 switch (vendor) {
188 case X86_VENDOR_INTEL: 137 case X86_VENDOR_INTEL:
189 if (x86 >= 6) 138 if (family >= 6)
190 reload_ucode_intel(); 139 reload_ucode_intel();
191 break; 140 break;
192 case X86_VENDOR_AMD: 141 case X86_VENDOR_AMD:
193 if (x86 >= 0x10) 142 if (family >= 0x10)
194 reload_ucode_amd(); 143 reload_ucode_amd();
195 break; 144 break;
196 default: 145 default:
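
Illustrative sketch (not from the patch above): load_ucode_bsp(), load_ucode_ap() and reload_early_microcode() keep calling x86_vendor() and x86_family(), but the local CPUID helpers that implemented them in core_early.c are deleted here, so shared equivalents are presumably used instead. The removed code shows how the vendor is identified: CPUID leaf 0 packs the vendor string into EBX, EDX, ECX, so three 32-bit compares are enough. The stand-alone version below reproduces that check; the enum and the little-endian memcpy trick in main() exist only for the demo.

#include <stdio.h>
#include <string.h>

#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))

enum vendor { VENDOR_UNKNOWN, VENDOR_INTEL, VENDOR_AMD };

static enum vendor vendor_from_regs(unsigned int ebx, unsigned int ecx,
				    unsigned int edx)
{
	if (ebx == QCHAR('G', 'e', 'n', 'u') &&
	    edx == QCHAR('i', 'n', 'e', 'I') &&
	    ecx == QCHAR('n', 't', 'e', 'l'))
		return VENDOR_INTEL;

	if (ebx == QCHAR('A', 'u', 't', 'h') &&
	    edx == QCHAR('e', 'n', 't', 'i') &&
	    ecx == QCHAR('c', 'A', 'M', 'D'))
		return VENDOR_AMD;

	return VENDOR_UNKNOWN;
}

int main(void)
{
	/* Fake register contents spelling "GenuineIntel", little-endian like x86. */
	unsigned int ebx, ecx, edx;

	memcpy(&ebx, "Genu", 4);
	memcpy(&edx, "ineI", 4);
	memcpy(&ecx, "ntel", 4);

	printf("vendor = %d (1 == Intel)\n", vendor_from_regs(ebx, ecx, edx));
	return 0;
}
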
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
124 cpf = cpu_sig.pf; 124 cpf = cpu_sig.pf;
125 crev = cpu_sig.rev; 125 crev = cpu_sig.rev;
126 126
127 return get_matching_microcode(csig, cpf, mc_intel, crev); 127 return get_matching_microcode(csig, cpf, crev, mc_intel);
128} 128}
129 129
130static int apply_microcode_intel(int cpu) 130static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
226 226
227 csig = uci->cpu_sig.sig; 227 csig = uci->cpu_sig.sig;
228 cpf = uci->cpu_sig.pf; 228 cpf = uci->cpu_sig.pf;
229 if (get_matching_microcode(csig, cpf, mc, new_rev)) { 229 if (get_matching_microcode(csig, cpf, new_rev, mc)) {
230 vfree(new_mc); 230 vfree(new_mc);
231 new_rev = mc_header.rev; 231 new_rev = mc_header.rev;
232 new_mc = mc; 232 new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
16 * as published by the Free Software Foundation; either version 16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version. 17 * 2 of the License, or (at your option) any later version.
18 */ 18 */
19
20/*
21 * This needs to be before all headers so that pr_debug in printk.h doesn't turn
22 * printk calls into no_printk().
23 *
24 *#define DEBUG
25 */
26
19#include <linux/module.h> 27#include <linux/module.h>
20#include <linux/mm.h> 28#include <linux/mm.h>
21#include <linux/slab.h> 29#include <linux/slab.h>
@@ -28,6 +36,9 @@
28#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
29#include <asm/setup.h> 37#include <asm/setup.h>
30 38
39#undef pr_fmt
40#define pr_fmt(fmt) "microcode: " fmt
41
31static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; 42static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
32static struct mc_saved_data { 43static struct mc_saved_data {
33 unsigned int mc_saved_count; 44 unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
35} mc_saved_data; 46} mc_saved_data;
36 47
37static enum ucode_state 48static enum ucode_state
38generic_load_microcode_early(struct microcode_intel **mc_saved_p, 49load_microcode_early(struct microcode_intel **saved,
39 unsigned int mc_saved_count, 50 unsigned int num_saved, struct ucode_cpu_info *uci)
40 struct ucode_cpu_info *uci)
41{ 51{
42 struct microcode_intel *ucode_ptr, *new_mc = NULL; 52 struct microcode_intel *ucode_ptr, *new_mc = NULL;
43 int new_rev = uci->cpu_sig.rev; 53 struct microcode_header_intel *mc_hdr;
44 enum ucode_state state = UCODE_OK; 54 int new_rev, ret, i;
45 unsigned int mc_size;
46 struct microcode_header_intel *mc_header;
47 unsigned int csig = uci->cpu_sig.sig;
48 unsigned int cpf = uci->cpu_sig.pf;
49 int i;
50 55
51 for (i = 0; i < mc_saved_count; i++) { 56 new_rev = uci->cpu_sig.rev;
52 ucode_ptr = mc_saved_p[i];
53 57
54 mc_header = (struct microcode_header_intel *)ucode_ptr; 58 for (i = 0; i < num_saved; i++) {
55 mc_size = get_totalsize(mc_header); 59 ucode_ptr = saved[i];
56 if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { 60 mc_hdr = (struct microcode_header_intel *)ucode_ptr;
57 new_rev = mc_header->rev;
58 new_mc = ucode_ptr;
59 }
60 }
61 61
62 if (!new_mc) { 62 ret = get_matching_microcode(uci->cpu_sig.sig,
63 state = UCODE_NFOUND; 63 uci->cpu_sig.pf,
64 goto out; 64 new_rev,
65 ucode_ptr);
66 if (!ret)
67 continue;
68
69 new_rev = mc_hdr->rev;
70 new_mc = ucode_ptr;
65 } 71 }
66 72
73 if (!new_mc)
74 return UCODE_NFOUND;
75
67 uci->mc = (struct microcode_intel *)new_mc; 76 uci->mc = (struct microcode_intel *)new_mc;
68out: 77 return UCODE_OK;
69 return state;
70} 78}
71 79
72static void 80static inline void
73microcode_pointer(struct microcode_intel **mc_saved, 81copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
74 unsigned long *mc_saved_in_initrd, 82 unsigned long off, int num_saved)
75 unsigned long initrd_start, int mc_saved_count)
76{ 83{
77 int i; 84 int i;
78 85
79 for (i = 0; i < mc_saved_count; i++) 86 for (i = 0; i < num_saved; i++)
80 mc_saved[i] = (struct microcode_intel *) 87 mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
81 (mc_saved_in_initrd[i] + initrd_start);
82} 88}
83 89
84#ifdef CONFIG_X86_32 90#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
102#endif 108#endif
103 109
104static enum ucode_state 110static enum ucode_state
105load_microcode(struct mc_saved_data *mc_saved_data, 111load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
106 unsigned long *mc_saved_in_initrd, 112 unsigned long initrd_start, struct ucode_cpu_info *uci)
107 unsigned long initrd_start,
108 struct ucode_cpu_info *uci)
109{ 113{
110 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; 114 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
111 unsigned int count = mc_saved_data->mc_saved_count; 115 unsigned int count = mc_saved_data->mc_saved_count;
112 116
113 if (!mc_saved_data->mc_saved) { 117 if (!mc_saved_data->mc_saved) {
114 microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, 118 copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
115 initrd_start, count);
116 119
117 return generic_load_microcode_early(mc_saved_tmp, count, uci); 120 return load_microcode_early(mc_saved_tmp, count, uci);
118 } else { 121 } else {
119#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
120 microcode_phys(mc_saved_tmp, mc_saved_data); 123 microcode_phys(mc_saved_tmp, mc_saved_data);
121 return generic_load_microcode_early(mc_saved_tmp, count, uci); 124 return load_microcode_early(mc_saved_tmp, count, uci);
122#else 125#else
123 return generic_load_microcode_early(mc_saved_data->mc_saved, 126 return load_microcode_early(mc_saved_data->mc_saved,
124 count, uci); 127 count, uci);
125#endif 128#endif
126 } 129 }
127} 130}
128 131
129static u8 get_x86_family(unsigned long sig)
130{
131 u8 x86;
132
133 x86 = (sig >> 8) & 0xf;
134
135 if (x86 == 0xf)
136 x86 += (sig >> 20) & 0xff;
137
138 return x86;
139}
140
141static u8 get_x86_model(unsigned long sig)
142{
143 u8 x86, x86_model;
144
145 x86 = get_x86_family(sig);
146 x86_model = (sig >> 4) & 0xf;
147
148 if (x86 == 0x6 || x86 == 0xf)
149 x86_model += ((sig >> 16) & 0xf) << 4;
150
151 return x86_model;
152}
153
154/* 132/*
155 * Given CPU signature and a microcode patch, this function finds if the 133 * Given CPU signature and a microcode patch, this function finds if the
156 * microcode patch has matching family and model with the CPU. 134 * microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
159matching_model_microcode(struct microcode_header_intel *mc_header, 137matching_model_microcode(struct microcode_header_intel *mc_header,
160 unsigned long sig) 138 unsigned long sig)
161{ 139{
162 u8 x86, x86_model; 140 unsigned int fam, model;
163 u8 x86_ucode, x86_model_ucode; 141 unsigned int fam_ucode, model_ucode;
164 struct extended_sigtable *ext_header; 142 struct extended_sigtable *ext_header;
165 unsigned long total_size = get_totalsize(mc_header); 143 unsigned long total_size = get_totalsize(mc_header);
166 unsigned long data_size = get_datasize(mc_header); 144 unsigned long data_size = get_datasize(mc_header);
167 int ext_sigcount, i; 145 int ext_sigcount, i;
168 struct extended_signature *ext_sig; 146 struct extended_signature *ext_sig;
169 147
170 x86 = get_x86_family(sig); 148 fam = __x86_family(sig);
171 x86_model = get_x86_model(sig); 149 model = x86_model(sig);
172 150
173 x86_ucode = get_x86_family(mc_header->sig); 151 fam_ucode = __x86_family(mc_header->sig);
174 x86_model_ucode = get_x86_model(mc_header->sig); 152 model_ucode = x86_model(mc_header->sig);
175 153
176 if (x86 == x86_ucode && x86_model == x86_model_ucode) 154 if (fam == fam_ucode && model == model_ucode)
177 return UCODE_OK; 155 return UCODE_OK;
178 156
179 /* Look for ext. headers: */ 157 /* Look for ext. headers: */
180 if (total_size <= data_size + MC_HEADER_SIZE) 158 if (total_size <= data_size + MC_HEADER_SIZE)
181 return UCODE_NFOUND; 159 return UCODE_NFOUND;
182 160
183 ext_header = (struct extended_sigtable *) 161 ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
184 mc_header + data_size + MC_HEADER_SIZE; 162 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
185 ext_sigcount = ext_header->count; 163 ext_sigcount = ext_header->count;
186 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
187 164
188 for (i = 0; i < ext_sigcount; i++) { 165 for (i = 0; i < ext_sigcount; i++) {
189 x86_ucode = get_x86_family(ext_sig->sig); 166 fam_ucode = __x86_family(ext_sig->sig);
190 x86_model_ucode = get_x86_model(ext_sig->sig); 167 model_ucode = x86_model(ext_sig->sig);
191 168
192 if (x86 == x86_ucode && x86_model == x86_model_ucode) 169 if (fam == fam_ucode && model == model_ucode)
193 return UCODE_OK; 170 return UCODE_OK;
194 171
195 ext_sig++; 172 ext_sig++;
196 } 173 }
197
198 return UCODE_NFOUND; 174 return UCODE_NFOUND;
199} 175}
200 176
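
Illustrative sketch (not from the patch above): the rewritten matching_model_microcode() calls __x86_family() and x86_model() where it used to call the local get_x86_family()/get_x86_model() helpers removed further up; the shared helpers are assumed to keep the same bit extraction. Stand-alone version of that decoding:

#include <stdio.h>

static unsigned int sig_family(unsigned long sig)
{
	unsigned int fam = (sig >> 8) & 0xf;

	if (fam == 0xf)				/* extended family field kicks in */
		fam += (sig >> 20) & 0xff;

	return fam;
}

static unsigned int sig_model(unsigned long sig)
{
	unsigned int fam = sig_family(sig);
	unsigned int model = (sig >> 4) & 0xf;

	if (fam == 0x6 || fam == 0xf)		/* extended model field kicks in */
		model += ((sig >> 16) & 0xf) << 4;

	return model;
}

int main(void)
{
	unsigned long sig = 0x000306c3;		/* made-up example CPUID(1).EAX value */

	/* A patch matches when both family and model agree with the CPU. */
	printf("family 0x%x, model 0x%x\n", sig_family(sig), sig_model(sig));
	return 0;
}

For the example signature this prints family 0x6, model 0x3c, and the same decoding is applied to mc_header->sig and to every extended-signature entry above.
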
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
204 unsigned int mc_saved_count) 180 unsigned int mc_saved_count)
205{ 181{
206 int i, j; 182 int i, j;
207 struct microcode_intel **mc_saved_p; 183 struct microcode_intel **saved_ptr;
208 int ret; 184 int ret;
209 185
210 if (!mc_saved_count) 186 if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
213 /* 189 /*
214 * Copy new microcode data. 190 * Copy new microcode data.
215 */ 191 */
216 mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), 192 saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
217 GFP_KERNEL); 193 if (!saved_ptr)
218 if (!mc_saved_p)
219 return -ENOMEM; 194 return -ENOMEM;
220 195
221 for (i = 0; i < mc_saved_count; i++) { 196 for (i = 0; i < mc_saved_count; i++) {
222 struct microcode_intel *mc = mc_saved_src[i]; 197 struct microcode_header_intel *mc_hdr;
223 struct microcode_header_intel *mc_header = &mc->hdr; 198 struct microcode_intel *mc;
224 unsigned long mc_size = get_totalsize(mc_header); 199 unsigned long size;
225 mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); 200
226 if (!mc_saved_p[i]) {
227 ret = -ENOMEM;
228 goto err;
229 }
230 if (!mc_saved_src[i]) { 201 if (!mc_saved_src[i]) {
231 ret = -EINVAL; 202 ret = -EINVAL;
232 goto err; 203 goto err;
233 } 204 }
234 memcpy(mc_saved_p[i], mc, mc_size); 205
206 mc = mc_saved_src[i];
207 mc_hdr = &mc->hdr;
208 size = get_totalsize(mc_hdr);
209
210 saved_ptr[i] = kmalloc(size, GFP_KERNEL);
211 if (!saved_ptr[i]) {
212 ret = -ENOMEM;
213 goto err;
214 }
215
216 memcpy(saved_ptr[i], mc, size);
235 } 217 }
236 218
237 /* 219 /*
238 * Point to newly saved microcode. 220 * Point to newly saved microcode.
239 */ 221 */
240 mc_saved_data->mc_saved = mc_saved_p; 222 mc_saved_data->mc_saved = saved_ptr;
241 mc_saved_data->mc_saved_count = mc_saved_count; 223 mc_saved_data->mc_saved_count = mc_saved_count;
242 224
243 return 0; 225 return 0;
244 226
245err: 227err:
246 for (j = 0; j <= i; j++) 228 for (j = 0; j <= i; j++)
247 kfree(mc_saved_p[j]); 229 kfree(saved_ptr[j]);
248 kfree(mc_saved_p); 230 kfree(saved_ptr);
249 231
250 return ret; 232 return ret;
251} 233}
@@ -257,48 +239,45 @@ err:
257 * - or if it is a newly discovered microcode patch. 239 * - or if it is a newly discovered microcode patch.
258 * 240 *
259 * The microcode patch should have matching model with CPU. 241 * The microcode patch should have matching model with CPU.
242 *
243 * Returns: The updated number @num_saved of saved microcode patches.
260 */ 244 */
261static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, 245static unsigned int _save_mc(struct microcode_intel **mc_saved,
262 unsigned int *mc_saved_count_p) 246 u8 *ucode_ptr, unsigned int num_saved)
263{ 247{
264 int i; 248 struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
265 int found = 0; 249 unsigned int sig, pf, new_rev;
266 unsigned int mc_saved_count = *mc_saved_count_p; 250 int found = 0, i;
267 struct microcode_header_intel *mc_header; 251
252 mc_hdr = (struct microcode_header_intel *)ucode_ptr;
253
254 for (i = 0; i < num_saved; i++) {
255 mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
256 sig = mc_saved_hdr->sig;
257 pf = mc_saved_hdr->pf;
258 new_rev = mc_hdr->rev;
259
260 if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
261 continue;
262
263 found = 1;
264
265 if (!revision_is_newer(mc_hdr, new_rev))
266 continue;
268 267
269 mc_header = (struct microcode_header_intel *)ucode_ptr;
270 for (i = 0; i < mc_saved_count; i++) {
271 unsigned int sig, pf;
272 unsigned int new_rev;
273 struct microcode_header_intel *mc_saved_header =
274 (struct microcode_header_intel *)mc_saved[i];
275 sig = mc_saved_header->sig;
276 pf = mc_saved_header->pf;
277 new_rev = mc_header->rev;
278
279 if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
280 found = 1;
281 if (update_match_revision(mc_header, new_rev)) {
282 /*
283 * Found an older ucode saved before.
284 * Replace the older one with this newer
285 * one.
286 */
287 mc_saved[i] =
288 (struct microcode_intel *)ucode_ptr;
289 break;
290 }
291 }
292 }
293 if (i >= mc_saved_count && !found)
294 /* 268 /*
295 * This ucode is first time discovered in ucode file. 269 * Found an older ucode saved earlier. Replace it with
296 * Save it to memory. 270 * this newer one.
297 */ 271 */
298 mc_saved[mc_saved_count++] = 272 mc_saved[i] = (struct microcode_intel *)ucode_ptr;
299 (struct microcode_intel *)ucode_ptr; 273 break;
274 }
275
276 /* Newly detected microcode, save it to memory. */
277 if (i >= num_saved && !found)
278 mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
300 279
301 *mc_saved_count_p = mc_saved_count; 280 return num_saved;
302} 281}
303 282
304/* 283/*
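
Illustrative sketch (not from the patch above): _save_mc() now returns the updated count of saved patches instead of writing it back through a pointer, and the replace-or-append decision is flattened into one loop with early continues. The user-space reduction below keeps only that contract; get_matching_sig() and revision_is_newer() are collapsed into direct field comparisons on a toy struct.

#include <stdio.h>

struct patch { unsigned int sig; int rev; };

static unsigned int save_patch(struct patch *saved, const struct patch *new_mc,
			       unsigned int num_saved)
{
	unsigned int i;
	int found = 0;

	for (i = 0; i < num_saved; i++) {
		if (saved[i].sig != new_mc->sig)
			continue;			/* stands in for get_matching_sig() */

		found = 1;

		if (new_mc->rev <= saved[i].rev)
			continue;			/* stands in for !revision_is_newer() */

		saved[i] = *new_mc;			/* replace the older copy */
		break;
	}

	/* Newly seen signature: append it. */
	if (i >= num_saved && !found)
		saved[num_saved++] = *new_mc;

	return num_saved;
}

int main(void)
{
	struct patch saved[4] = { { 0x306c3, 0x10 } };
	unsigned int count = 1;
	struct patch newer = { 0x306c3, 0x1c };
	struct patch other = { 0x40651, 0x0d };

	count = save_patch(saved, &newer, count);	/* replaces slot 0, count stays 1 */
	count = save_patch(saved, &other, count);	/* appends, count becomes 2 */

	printf("count=%u rev[0]=0x%x\n", count, saved[0].rev);
	return 0;
}

Callers then just do num_saved = _save_mc(mc_saved, ucode_ptr, num_saved), which is exactly what the get_matching_model_microcode() and save_mc_for_early() hunks below switch to.
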
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
346 continue; 325 continue;
347 } 326 }
348 327
349 _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); 328 mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
350 329
351 ucode_ptr += mc_size; 330 ucode_ptr += mc_size;
352 } 331 }
@@ -372,7 +351,7 @@ out:
372static int collect_cpu_info_early(struct ucode_cpu_info *uci) 351static int collect_cpu_info_early(struct ucode_cpu_info *uci)
373{ 352{
374 unsigned int val[2]; 353 unsigned int val[2];
375 u8 x86, x86_model; 354 unsigned int family, model;
376 struct cpu_signature csig; 355 struct cpu_signature csig;
377 unsigned int eax, ebx, ecx, edx; 356 unsigned int eax, ebx, ecx, edx;
378 357
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
387 native_cpuid(&eax, &ebx, &ecx, &edx); 366 native_cpuid(&eax, &ebx, &ecx, &edx);
388 csig.sig = eax; 367 csig.sig = eax;
389 368
390 x86 = get_x86_family(csig.sig); 369 family = __x86_family(csig.sig);
391 x86_model = get_x86_model(csig.sig); 370 model = x86_model(csig.sig);
392 371
393 if ((x86_model >= 5) || (x86 > 6)) { 372 if ((model >= 5) || (family > 6)) {
394 /* get processor flags from MSR 0x17 */ 373 /* get processor flags from MSR 0x17 */
395 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); 374 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
396 csig.pf = 1 << ((val[1] >> 18) & 7); 375 csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
429 sig = uci.cpu_sig.sig; 408 sig = uci.cpu_sig.sig;
430 pf = uci.cpu_sig.pf; 409 pf = uci.cpu_sig.pf;
431 rev = uci.cpu_sig.rev; 410 rev = uci.cpu_sig.rev;
432 pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", 411 pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
433 smp_processor_id(), sig, pf, rev);
434 412
435 for (i = 0; i < mc_saved_data.mc_saved_count; i++) { 413 for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
436 struct microcode_header_intel *mc_saved_header; 414 struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
457 if (total_size <= data_size + MC_HEADER_SIZE) 435 if (total_size <= data_size + MC_HEADER_SIZE)
458 continue; 436 continue;
459 437
460 ext_header = (struct extended_sigtable *) 438 ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
461 mc_saved_header + data_size + MC_HEADER_SIZE;
462 ext_sigcount = ext_header->count; 439 ext_sigcount = ext_header->count;
463 ext_sig = (void *)ext_header + EXT_HEADER_SIZE; 440 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
464 441
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
515 * Save the microcode patch mc in mc_save_tmp structure if it's a newer 492 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
516 * version. 493 * version.
517 */ 494 */
518 495 mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
519 _save_mc(mc_saved_tmp, mc, &mc_saved_count);
520 496
521 /* 497 /*
522 * Save the mc_save_tmp in global mc_saved_data. 498 * Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
548 524
549static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; 525static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
550static __init enum ucode_state 526static __init enum ucode_state
551scan_microcode(unsigned long start, unsigned long end, 527scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
552 struct mc_saved_data *mc_saved_data, 528 unsigned long start, unsigned long size,
553 unsigned long *mc_saved_in_initrd, 529 struct ucode_cpu_info *uci)
554 struct ucode_cpu_info *uci)
555{ 530{
556 unsigned int size = end - start + 1;
557 struct cpio_data cd; 531 struct cpio_data cd;
558 long offset = 0; 532 long offset = 0;
559#ifdef CONFIG_X86_32 533#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
569 if (!cd.data) 543 if (!cd.data)
570 return UCODE_ERROR; 544 return UCODE_ERROR;
571 545
572
573 return get_matching_model_microcode(0, start, cd.data, cd.size, 546 return get_matching_model_microcode(0, start, cd.data, cd.size,
574 mc_saved_data, mc_saved_in_initrd, 547 mc_saved_data, initrd, uci);
575 uci);
576} 548}
577 549
578/* 550/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
704 if (count == 0) 676 if (count == 0)
705 return ret; 677 return ret;
706 678
707 microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); 679 copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
708 ret = save_microcode(&mc_saved_data, mc_saved, count); 680 ret = save_microcode(&mc_saved_data, mc_saved, count);
709 if (ret) 681 if (ret)
710 pr_err("Cannot save microcode patches from initrd.\n"); 682 pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
716 688
717static void __init 689static void __init
718_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, 690_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
719 unsigned long *mc_saved_in_initrd, 691 unsigned long *initrd,
720 unsigned long initrd_start_early, 692 unsigned long start, unsigned long size)
721 unsigned long initrd_end_early,
722 struct ucode_cpu_info *uci)
723{ 693{
694 struct ucode_cpu_info uci;
724 enum ucode_state ret; 695 enum ucode_state ret;
725 696
726 collect_cpu_info_early(uci); 697 collect_cpu_info_early(&uci);
727 scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
728 mc_saved_in_initrd, uci);
729 698
730 ret = load_microcode(mc_saved_data, mc_saved_in_initrd, 699 ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
731 initrd_start_early, uci); 700 if (ret != UCODE_OK)
701 return;
732 702
733 if (ret == UCODE_OK) 703 ret = load_microcode(mc_saved_data, initrd, start, &uci);
734 apply_microcode_early(uci, true); 704 if (ret != UCODE_OK)
705 return;
706
707 apply_microcode_early(&uci, true);
735} 708}
736 709
737void __init 710void __init load_ucode_intel_bsp(void)
738load_ucode_intel_bsp(void)
739{ 711{
740 u64 ramdisk_image, ramdisk_size; 712 u64 start, size;
741 unsigned long initrd_start_early, initrd_end_early;
742 struct ucode_cpu_info uci;
743#ifdef CONFIG_X86_32 713#ifdef CONFIG_X86_32
744 struct boot_params *boot_params_p; 714 struct boot_params *p;
745 715
746 boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); 716 p = (struct boot_params *)__pa_nodebug(&boot_params);
747 ramdisk_image = boot_params_p->hdr.ramdisk_image; 717 start = p->hdr.ramdisk_image;
748 ramdisk_size = boot_params_p->hdr.ramdisk_size; 718 size = p->hdr.ramdisk_size;
749 initrd_start_early = ramdisk_image;
750 initrd_end_early = initrd_start_early + ramdisk_size;
751 719
752 _load_ucode_intel_bsp( 720 _load_ucode_intel_bsp(
753 (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), 721 (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
754 (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), 722 (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
755 initrd_start_early, initrd_end_early, &uci); 723 start, size);
756#else 724#else
757 ramdisk_image = boot_params.hdr.ramdisk_image; 725 start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
758 ramdisk_size = boot_params.hdr.ramdisk_size; 726 size = boot_params.hdr.ramdisk_size;
759 initrd_start_early = ramdisk_image + PAGE_OFFSET; 727
760 initrd_end_early = initrd_start_early + ramdisk_size; 728 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
761
762 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
763 initrd_start_early, initrd_end_early,
764 &uci);
765#endif 729#endif
766} 730}
767 731
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
771 struct ucode_cpu_info uci; 735 struct ucode_cpu_info uci;
772 unsigned long *mc_saved_in_initrd_p; 736 unsigned long *mc_saved_in_initrd_p;
773 unsigned long initrd_start_addr; 737 unsigned long initrd_start_addr;
738 enum ucode_state ret;
774#ifdef CONFIG_X86_32 739#ifdef CONFIG_X86_32
775 unsigned long *initrd_start_p; 740 unsigned long *initrd_start_p;
776 741
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
793 return; 758 return;
794 759
795 collect_cpu_info_early(&uci); 760 collect_cpu_info_early(&uci);
796 load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, 761 ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
797 initrd_start_addr, &uci); 762 initrd_start_addr, &uci);
763
764 if (ret != UCODE_OK)
765 return;
766
798 apply_microcode_early(&uci, true); 767 apply_microcode_early(&uci, true);
799} 768}
800 769
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
808 777
809 collect_cpu_info_early(&uci); 778 collect_cpu_info_early(&uci);
810 779
811 ret = generic_load_microcode_early(mc_saved_data.mc_saved, 780 ret = load_microcode_early(mc_saved_data.mc_saved,
812 mc_saved_data.mc_saved_count, &uci); 781 mc_saved_data.mc_saved_count, &uci);
813 if (ret != UCODE_OK) 782 if (ret != UCODE_OK)
814 return; 783 return;
815 784
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
38 return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; 38 return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
39} 39}
40 40
41int
42update_match_revision(struct microcode_header_intel *mc_header, int rev)
43{
44 return (mc_header->rev <= rev) ? 0 : 1;
45}
46
47int microcode_sanity_check(void *mc, int print_err) 41int microcode_sanity_check(void *mc, int print_err)
48{ 42{
49 unsigned long total_size, data_size, ext_table_size; 43 unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
128EXPORT_SYMBOL_GPL(microcode_sanity_check); 122EXPORT_SYMBOL_GPL(microcode_sanity_check);
129 123
130/* 124/*
131 * return 0 - no update found 125 * Returns 1 if update has been found, 0 otherwise.
132 * return 1 - found update
133 */ 126 */
134int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) 127int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
135{ 128{
136 struct microcode_header_intel *mc_header = mc; 129 struct microcode_header_intel *mc_header = mc;
137 struct extended_sigtable *ext_header; 130 struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
159} 152}
160 153
161/* 154/*
162 * return 0 - no update found 155 * Returns 1 if update has been found, 0 otherwise.
163 * return 1 - found update
164 */ 156 */
165int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) 157int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
166{ 158{
167 struct microcode_header_intel *mc_header = mc; 159 struct microcode_header_intel *mc_hdr = mc;
168 160
169 if (!update_match_revision(mc_header, rev)) 161 if (!revision_is_newer(mc_hdr, rev))
170 return 0; 162 return 0;
171 163
172 return get_matching_sig(csig, cpf, mc, rev); 164 return get_matching_sig(csig, cpf, rev, mc);
173} 165}
174EXPORT_SYMBOL_GPL(get_matching_microcode); 166EXPORT_SYMBOL_GPL(get_matching_microcode);
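
Illustrative sketch (not from the patch above): update_match_revision() is deleted and get_matching_microcode() now calls revision_is_newer(), which, going by the helper it replaces, is assumed to be the same strict "candidate revision is higher" test moved into a header. Reduced to the one field that matters:

#include <stdio.h>

struct mc_header { int rev; };	/* stand-in for struct microcode_header_intel */

static int revision_is_newer(const struct mc_header *mc_hdr, int rev)
{
	return mc_hdr->rev > rev;
}

int main(void)
{
	struct mc_header patch = { .rev = 0x1c };

	printf("%d %d\n",
	       revision_is_newer(&patch, 0x1b),		/* 1: newer, worth applying */
	       revision_is_newer(&patch, 0x1c));	/* 0: same revision, skip */
	return 0;
}
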
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
6IN=$1 6IN=$1
7OUT=$2 7OUT=$2
8 8
9function dump_array() 9dump_array()
10{ 10{
11 ARRAY=$1 11 ARRAY=$1
12 SIZE=$2 12 SIZE=$2
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
404static int mtrr_seq_show(struct seq_file *seq, void *offset) 404static int mtrr_seq_show(struct seq_file *seq, void *offset)
405{ 405{
406 char factor; 406 char factor;
407 int i, max, len; 407 int i, max;
408 mtrr_type type; 408 mtrr_type type;
409 unsigned long base, size; 409 unsigned long base, size;
410 410
411 len = 0;
412 max = num_var_ranges; 411 max = num_var_ranges;
413 for (i = 0; i < max; i++) { 412 for (i = 0; i < max; i++) {
414 mtrr_if->get(i, &base, &size, &type); 413 mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
425 size >>= 20 - PAGE_SHIFT; 424 size >>= 20 - PAGE_SHIFT;
426 } 425 }
427 /* Base can be > 32bit */ 426 /* Base can be > 32bit */
428 len += seq_printf(seq, "reg%02i: base=0x%06lx000 " 427 seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
429 "(%5luMB), size=%5lu%cB, count=%d: %s\n", 428 i, base, base >> (20 - PAGE_SHIFT),
430 i, base, base >> (20 - PAGE_SHIFT), size, 429 size, factor,
431 factor, mtrr_usage_table[i], 430 mtrr_usage_table[i], mtrr_attrib_to_str(type));
432 mtrr_attrib_to_str(type));
433 } 431 }
434 return 0; 432 return 0;
435} 433}
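
Illustrative sketch (not from the patch above): mtrr_seq_show() stops summing seq_printf() return values because the total was never used and the seq_file core tracks overflow on its own. A user-space analogue of that "just emit, let the buffer bookkeeping handle truncation" pattern, with vsnprintf standing in for the seq_file internals:

#include <stdarg.h>
#include <stdio.h>

struct seq { char buf[256]; size_t count; int overflowed; };

static void seq_emit(struct seq *s, const char *fmt, ...)	/* note: returns nothing */
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->count, sizeof(s->buf) - s->count, fmt, ap);
	va_end(ap);

	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->count)
		s->overflowed = 1;		/* what seq_has_overflowed() would report */
	else
		s->count += (size_t)n;
}

int main(void)
{
	struct seq s = { .count = 0 };
	unsigned long base = 0xc0000, size_mb = 256;	/* made-up register values */

	seq_emit(&s, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB\n",
		 0, base, base >> 8, size_mb, 'M');

	if (!s.overflowed)
		fputs(s.buf, stdout);
	return 0;
}
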
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 689e35760924..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2236,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
2236static unsigned long code_segment_base(struct pt_regs *regs) 2236static unsigned long code_segment_base(struct pt_regs *regs)
2237{ 2237{
2238 /* 2238 /*
2239 * For IA32 we look at the GDT/LDT segment base to convert the
2240 * effective IP to a linear address.
2241 */
2242
2243#ifdef CONFIG_X86_32
2244 /*
2239 * If we are in VM86 mode, add the segment offset to convert to a 2245 * If we are in VM86 mode, add the segment offset to convert to a
2240 * linear address. 2246 * linear address.
2241 */ 2247 */
2242 if (regs->flags & X86_VM_MASK) 2248 if (regs->flags & X86_VM_MASK)
2243 return 0x10 * regs->cs; 2249 return 0x10 * regs->cs;
2244 2250
2245 /*
2246 * For IA32 we look at the GDT/LDT segment base to convert the
2247 * effective IP to a linear address.
2248 */
2249#ifdef CONFIG_X86_32
2250 if (user_mode(regs) && regs->cs != __USER_CS) 2251 if (user_mode(regs) && regs->cs != __USER_CS)
2251 return get_segment_base(regs->cs); 2252 return get_segment_base(regs->cs);
2252#else 2253#else
2253 if (test_thread_flag(TIF_IA32)) { 2254 if (user_mode(regs) && !user_64bit_mode(regs) &&
2254 if (user_mode(regs) && regs->cs != __USER32_CS) 2255 regs->cs != __USER32_CS)
2255 return get_segment_base(regs->cs); 2256 return get_segment_base(regs->cs);
2256 }
2257#endif 2257#endif
2258 return 0; 2258 return 0;
2259} 2259}
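
Illustrative sketch (not from the patch above): on 64-bit kernels code_segment_base() now decides purely from the register state, user_mode() plus !user_64bit_mode(), whether a non-standard compat code segment needs a GDT/LDT base lookup, instead of consulting the TIF_IA32 thread flag. Everything kernel-specific below is a stub (the selector value and the fake segment base included); only the shape of the test is the point.

#include <stdbool.h>
#include <stdio.h>

#define FAKE_USER32_CS 0x23	/* stands in for __USER32_CS */

struct fake_regs { unsigned long cs; bool user; bool cs_is_64bit; };

static bool user_mode(const struct fake_regs *r)       { return r->user; }
static bool user_64bit_mode(const struct fake_regs *r) { return r->cs_is_64bit; }

static unsigned long get_segment_base(unsigned long cs)
{
	return 0x1000 * cs;	/* pretend descriptor-table lookup */
}

static unsigned long code_segment_base(const struct fake_regs *r)
{
	if (user_mode(r) && !user_64bit_mode(r) && r->cs != FAKE_USER32_CS)
		return get_segment_base(r->cs);	/* odd compat segment: look it up */

	return 0;	/* kernel, 64-bit user, or plain 32-bit user: flat */
}

int main(void)
{
	struct fake_regs plain32 = { .cs = FAKE_USER32_CS, .user = true, .cs_is_64bit = false };
	struct fake_regs ldtseg  = { .cs = 0x37,           .user = true, .cs_is_64bit = false };

	printf("%#lx %#lx\n", code_segment_base(&plain32), code_segment_base(&ldtseg));
	return 0;
}
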
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
105#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
106 struct pt_regs fixed_regs; 106 struct pt_regs fixed_regs;
107 107
108 if (!user_mode_vm(regs)) { 108 if (!user_mode(regs)) {
109 crash_fixup_ss_esp(&fixed_regs, regs); 109 crash_fixup_ss_esp(&fixed_regs, regs);
110 regs = &fixed_regs; 110 regs = &fixed_regs;
111 } 111 }
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d3503351242..6367a780cc8c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
286 initial_boot_params = dt = early_memremap(initial_dtb, map_len); 286 initial_boot_params = dt = early_memremap(initial_dtb, map_len);
287 size = of_get_flat_dt_size(); 287 size = of_get_flat_dt_size();
288 if (map_len < size) { 288 if (map_len < size) {
289 early_iounmap(dt, map_len); 289 early_memunmap(dt, map_len);
290 initial_boot_params = dt = early_memremap(initial_dtb, size); 290 initial_boot_params = dt = early_memremap(initial_dtb, size);
291 map_len = size; 291 map_len = size;
292 } 292 }
293 293
294 unflatten_and_copy_device_tree(); 294 unflatten_and_copy_device_tree();
295 early_iounmap(dt, map_len); 295 early_memunmap(dt, map_len);
296} 296}
297#else 297#else
298static inline void x86_flattree_get_config(void) { } 298static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..9c30acfadae2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
25int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; 25int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
26static int die_counter; 26static int die_counter;
27 27
28static void printk_stack_address(unsigned long address, int reliable) 28static void printk_stack_address(unsigned long address, int reliable,
29 void *data)
29{ 30{
30 pr_cont(" [<%p>] %s%pB\n", 31 printk("%s [<%p>] %s%pB\n",
31 (void *)address, reliable ? "" : "? ", (void *)address); 32 (char *)data, (void *)address, reliable ? "" : "? ",
33 (void *)address);
32} 34}
33 35
34void printk_address(unsigned long address) 36void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
155static void print_trace_address(void *data, unsigned long addr, int reliable) 157static void print_trace_address(void *data, unsigned long addr, int reliable)
156{ 158{
157 touch_nmi_watchdog(); 159 touch_nmi_watchdog();
158 printk(data); 160 printk_stack_address(addr, reliable, data);
159 printk_stack_address(addr, reliable);
160} 161}
161 162
162static const struct stacktrace_ops print_trace_ops = { 163static const struct stacktrace_ops print_trace_ops = {
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
278 print_modules(); 279 print_modules();
279 show_regs(regs); 280 show_regs(regs);
280#ifdef CONFIG_X86_32 281#ifdef CONFIG_X86_32
281 if (user_mode_vm(regs)) { 282 if (user_mode(regs)) {
282 sp = regs->sp; 283 sp = regs->sp;
283 ss = regs->ss & 0xffff; 284 ss = regs->ss & 0xffff;
284 } else { 285 } else {
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err)
307 unsigned long flags = oops_begin(); 308 unsigned long flags = oops_begin();
308 int sig = SIGSEGV; 309 int sig = SIGSEGV;
309 310
310 if (!user_mode_vm(regs)) 311 if (!user_mode(regs))
311 report_bug(regs->ip, regs); 312 report_bug(regs->ip, regs);
312 313
313 if (__die(str, regs, err)) 314 if (__die(str, regs, err))
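
Illustrative sketch (not from the patch above): printk_stack_address() gains the stacktrace_ops data argument, which on this path carries the log-level prefix, so each address line is printed at an explicit level instead of relying on a pr_cont() continuation of whatever was printed before. The user-space shape of that plumbing, with printf in place of printk and a placeholder in place of the %pB symbol lookup:

#include <stdio.h>

struct stacktrace_ops {
	void (*address)(void *data, unsigned long addr, int reliable);
};

static void printk_stack_address(unsigned long address, int reliable, void *data)
{
	printf("%s [<%#lx>] %s<symbol for %#lx>\n",
	       (const char *)data, address, reliable ? "" : "? ", address);
}

static void print_trace_address(void *data, unsigned long addr, int reliable)
{
	printk_stack_address(addr, reliable, data);	/* data is just passed through */
}

static const struct stacktrace_ops print_trace_ops = {
	.address = print_trace_address,
};

int main(void)
{
	unsigned long fake_trace[] = { 0xc1000010UL, 0xc10fabcdUL };	/* made-up addresses */
	char log_lvl[] = " <4>";	/* stands in for the kernel log level string */
	unsigned int i;

	for (i = 0; i < 2; i++)
		print_trace_ops.address(log_lvl, fake_trace[i], i == 0);
	return 0;
}
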
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..464ffd69b92e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
108 for (i = 0; i < kstack_depth_to_print; i++) { 108 for (i = 0; i < kstack_depth_to_print; i++) {
109 if (kstack_end(stack)) 109 if (kstack_end(stack))
110 break; 110 break;
111 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 111 if ((i % STACKSLOTS_PER_LINE) == 0) {
112 pr_cont("\n"); 112 if (i != 0)
113 pr_cont(" %08lx", *stack++); 113 pr_cont("\n");
114 printk("%s %08lx", log_lvl, *stack++);
115 } else
116 pr_cont(" %08lx", *stack++);
114 touch_nmi_watchdog(); 117 touch_nmi_watchdog();
115 } 118 }
116 pr_cont("\n"); 119 pr_cont("\n");
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs)
123 int i; 126 int i;
124 127
125 show_regs_print_info(KERN_EMERG); 128 show_regs_print_info(KERN_EMERG);
126 __show_regs(regs, !user_mode_vm(regs)); 129 __show_regs(regs, !user_mode(regs));
127 130
128 /* 131 /*
129 * When in-kernel, we also print out the stack and code at the 132 * When in-kernel, we also print out the stack and code at the
130 * time of the fault.. 133 * time of the fault..
131 */ 134 */
132 if (!user_mode_vm(regs)) { 135 if (!user_mode(regs)) {
133 unsigned int code_prologue = code_bytes * 43 / 64; 136 unsigned int code_prologue = code_bytes * 43 / 64;
134 unsigned int code_len = code_bytes; 137 unsigned int code_len = code_bytes;
135 unsigned char c; 138 unsigned char c;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index ff86f19b5758..5f1c6266eb30 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
280 pr_cont(" <EOI> "); 280 pr_cont(" <EOI> ");
281 } 281 }
282 } else { 282 } else {
283 if (((long) stack & (THREAD_SIZE-1)) == 0) 283 if (kstack_end(stack))
284 break; 284 break;
285 } 285 }
286 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 286 if ((i % STACKSLOTS_PER_LINE) == 0) {
287 pr_cont("\n"); 287 if (i != 0)
288 pr_cont(" %016lx", *stack++); 288 pr_cont("\n");
289 printk("%s %016lx", log_lvl, *stack++);
290 } else
291 pr_cont(" %016lx", *stack++);
289 touch_nmi_watchdog(); 292 touch_nmi_watchdog();
290 } 293 }
291 preempt_enable(); 294 preempt_enable();
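
Illustrative sketch (not from the patch above): both show_stack_log_lvl() hunks (dumpstack_32.c and dumpstack_64.c) now start each stack line with printk("%s ...", log_lvl) and only use pr_cont() for the remaining slots on that line, so every line carries its own log level. Compressed user-space version of that line-breaking logic; the slots-per-line value and the fake stack contents are made up.

#include <stdio.h>

#define STACKSLOTS_PER_LINE 4	/* illustrative; the kernel picks this per word size */

int main(void)
{
	const char *log_lvl = " <4>";	/* stands in for the kernel log level string */
	unsigned long stack[10];
	int i;

	for (i = 0; i < 10; i++)
		stack[i] = 0xc1000000UL + 0x10UL * i;		/* fake stack words */

	for (i = 0; i < 10; i++) {
		if ((i % STACKSLOTS_PER_LINE) == 0) {
			if (i != 0)
				printf("\n");			/* finish previous line */
			printf("%s %08lx", log_lvl, stack[i]);	/* new line, explicit level */
		} else {
			printf(" %08lx", stack[i]);		/* continuation on same line */
		}
	}
	printf("\n");
	return 0;
}
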
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..7d46bb260334 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
661 extmap = (struct e820entry *)(sdata->data); 661 extmap = (struct e820entry *)(sdata->data);
662 __append_e820_map(extmap, entries); 662 __append_e820_map(extmap, entries);
663 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 663 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
664 early_iounmap(sdata, data_len); 664 early_memunmap(sdata, data_len);
665 printk(KERN_INFO "e820: extended physical RAM map:\n"); 665 printk(KERN_INFO "e820: extended physical RAM map:\n");
666 e820_print_map("extended"); 666 e820_print_map("extended");
667} 667}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a1be88..49ff55ef9b26 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
95#define DLL 0 /* Divisor Latch Low */ 95#define DLL 0 /* Divisor Latch Low */
96#define DLH 1 /* Divisor latch High */ 96#define DLH 1 /* Divisor latch High */
97 97
98static void mem32_serial_out(unsigned long addr, int offset, int value)
99{
100 uint32_t *vaddr = (uint32_t *)addr;
101 /* shift implied by pointer type */
102 writel(value, vaddr + offset);
103}
104
105static unsigned int mem32_serial_in(unsigned long addr, int offset)
106{
107 uint32_t *vaddr = (uint32_t *)addr;
108 /* shift implied by pointer type */
109 return readl(vaddr + offset);
110}
111
112static unsigned int io_serial_in(unsigned long addr, int offset) 98static unsigned int io_serial_in(unsigned long addr, int offset)
113{ 99{
114 return inb(addr + offset); 100 return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
205} 191}
206 192
207#ifdef CONFIG_PCI 193#ifdef CONFIG_PCI
194static void mem32_serial_out(unsigned long addr, int offset, int value)
195{
196 u32 *vaddr = (u32 *)addr;
197 /* shift implied by pointer type */
198 writel(value, vaddr + offset);
199}
200
201static unsigned int mem32_serial_in(unsigned long addr, int offset)
202{
203 u32 *vaddr = (u32 *)addr;
204 /* shift implied by pointer type */
205 return readl(vaddr + offset);
206}
207
208/* 208/*
209 * early_pci_serial_init() 209 * early_pci_serial_init()
210 * 210 *
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
217 unsigned divisor; 217 unsigned divisor;
218 unsigned long baud = DEFAULT_BAUD; 218 unsigned long baud = DEFAULT_BAUD;
219 u8 bus, slot, func; 219 u8 bus, slot, func;
220 uint32_t classcode, bar0; 220 u32 classcode, bar0;
221 uint16_t cmdreg; 221 u16 cmdreg;
222 char *e; 222 char *e;
223 223
224 224
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
395 /*CFI_REL_OFFSET cs, 0*/ 395 /*CFI_REL_OFFSET cs, 0*/
396 /* 396 /*
397 * Push current_thread_info()->sysenter_return to the stack. 397 * Push current_thread_info()->sysenter_return to the stack.
398 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 398 * A tiny bit of offset fixup is necessary: TI_sysenter_return
399 * pushed above; +8 corresponds to copy_thread's esp0 setting. 399 * is relative to thread_info, which is at the bottom of the
400 * kernel stack page. 4*4 means the 4 words pushed above;
401 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
402 * and THREAD_SIZE takes us to the bottom.
400 */ 403 */
401 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) 404 pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
402 CFI_REL_OFFSET eip, 0 405 CFI_REL_OFFSET eip, 0
403 406
404 pushl_cfi %eax 407 pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
432 TRACE_IRQS_OFF 435 TRACE_IRQS_OFF
433 movl TI_flags(%ebp), %ecx 436 movl TI_flags(%ebp), %ecx
434 testl $_TIF_ALLWORK_MASK, %ecx 437 testl $_TIF_ALLWORK_MASK, %ecx
435 jne sysexit_audit 438 jnz sysexit_audit
436sysenter_exit: 439sysenter_exit:
437/* if something modifies registers it must also disable sysexit */ 440/* if something modifies registers it must also disable sysexit */
438 movl PT_EIP(%esp), %edx 441 movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
460 463
461sysexit_audit: 464sysexit_audit:
462 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 465 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
463 jne syscall_exit_work 466 jnz syscall_exit_work
464 TRACE_IRQS_ON 467 TRACE_IRQS_ON
465 ENABLE_INTERRUPTS(CLBR_ANY) 468 ENABLE_INTERRUPTS(CLBR_ANY)
466 movl %eax,%edx /* second arg, syscall return value */ 469 movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
472 TRACE_IRQS_OFF 475 TRACE_IRQS_OFF
473 movl TI_flags(%ebp), %ecx 476 movl TI_flags(%ebp), %ecx
474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 477 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
475 jne syscall_exit_work 478 jnz syscall_exit_work
476 movl PT_EAX(%esp),%eax /* reload syscall return value */ 479 movl PT_EAX(%esp),%eax /* reload syscall return value */
477 jmp sysenter_exit 480 jmp sysenter_exit
478#endif 481#endif
@@ -510,7 +513,7 @@ syscall_exit:
510 TRACE_IRQS_OFF 513 TRACE_IRQS_OFF
511 movl TI_flags(%ebp), %ecx 514 movl TI_flags(%ebp), %ecx
512 testl $_TIF_ALLWORK_MASK, %ecx # current->work 515 testl $_TIF_ALLWORK_MASK, %ecx # current->work
513 jne syscall_exit_work 516 jnz syscall_exit_work
514 517
515restore_all: 518restore_all:
516 TRACE_IRQS_IRET 519 TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
612#ifdef CONFIG_VM86 615#ifdef CONFIG_VM86
613 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 616 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614 movl %esp, %eax 617 movl %esp, %eax
615 jne work_notifysig_v86 # returning to kernel-space or 618 jnz work_notifysig_v86 # returning to kernel-space or
616 # vm86-space 619 # vm86-space
6171: 6201:
618#else 621#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
720.endm 723.endm
721 724
722/* 725/*
723 * Build the entry stubs and pointer table with some assembler magic. 726 * Build the entry stubs with some assembler magic.
724 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 727 * We pack 1 stub into every 8-byte block.
725 * single cache line on all modern x86 implementations.
726 */ 728 */
727.section .init.rodata,"a" 729 .align 8
728ENTRY(interrupt)
729.section .entry.text, "ax"
730 .p2align 5
731 .p2align CONFIG_X86_L1_CACHE_SHIFT
732ENTRY(irq_entries_start) 730ENTRY(irq_entries_start)
733 RING0_INT_FRAME 731 RING0_INT_FRAME
734vector=FIRST_EXTERNAL_VECTOR 732 vector=FIRST_EXTERNAL_VECTOR
735.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 733 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
736 .balign 32 734 pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
737 .rept 7 735 vector=vector+1
738 .if vector < FIRST_SYSTEM_VECTOR 736 jmp common_interrupt
739 .if vector <> FIRST_EXTERNAL_VECTOR
740 CFI_ADJUST_CFA_OFFSET -4 737 CFI_ADJUST_CFA_OFFSET -4
741 .endif 738 .align 8
7421: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ 739 .endr
743 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
744 jmp 2f
745 .endif
746 .previous
747 .long 1b
748 .section .entry.text, "ax"
749vector=vector+1
750 .endif
751 .endr
7522: jmp common_interrupt
753.endr
754END(irq_entries_start) 740END(irq_entries_start)
755 741
756.previous
757END(interrupt)
758.previous
759
760/* 742/*
761 * the CPU automatically disables interrupts when executing an IRQ vector, 743 * the CPU automatically disables interrupts when executing an IRQ vector,
762 * so IRQ-flags tracing has to follow that: 744 * so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
816 pushl_cfi $0 798 pushl_cfi $0
817#ifdef CONFIG_X86_INVD_BUG 799#ifdef CONFIG_X86_INVD_BUG
818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 800 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
819661: pushl_cfi $do_general_protection 801 ALTERNATIVE "pushl_cfi $do_general_protection", \
820662: 802 "pushl $do_simd_coprocessor_error", \
821.section .altinstructions,"a" 803 X86_FEATURE_XMM
822 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
823.previous
824.section .altinstr_replacement,"ax"
825663: pushl $do_simd_coprocessor_error
826664:
827.previous
828#else 804#else
829 pushl_cfi $do_simd_coprocessor_error 805 pushl_cfi $do_simd_coprocessor_error
830#endif 806#endif
@@ -1240,20 +1216,13 @@ error_code:
1240 /*CFI_REL_OFFSET es, 0*/ 1216 /*CFI_REL_OFFSET es, 0*/
1241 pushl_cfi %ds 1217 pushl_cfi %ds
1242 /*CFI_REL_OFFSET ds, 0*/ 1218 /*CFI_REL_OFFSET ds, 0*/
1243 pushl_cfi %eax 1219 pushl_cfi_reg eax
1244 CFI_REL_OFFSET eax, 0 1220 pushl_cfi_reg ebp
1245 pushl_cfi %ebp 1221 pushl_cfi_reg edi
1246 CFI_REL_OFFSET ebp, 0 1222 pushl_cfi_reg esi
1247 pushl_cfi %edi 1223 pushl_cfi_reg edx
1248 CFI_REL_OFFSET edi, 0 1224 pushl_cfi_reg ecx
1249 pushl_cfi %esi 1225 pushl_cfi_reg ebx
1250 CFI_REL_OFFSET esi, 0
1251 pushl_cfi %edx
1252 CFI_REL_OFFSET edx, 0
1253 pushl_cfi %ecx
1254 CFI_REL_OFFSET ecx, 0
1255 pushl_cfi %ebx
1256 CFI_REL_OFFSET ebx, 0
1257 cld 1226 cld
1258 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1259 movl %ecx, %fs 1228 movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2babb393915e..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
14 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
16 * 16 *
17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et.al.
19 *
20 * A note on terminology: 17 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP 18 * - iret frame: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack. 19 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all register saved.
25 * 20 *
26 * Some macro usage: 21 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better 22 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code. 23 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers
32 * not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table. 24 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. 25 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - idtentry - Define exception entry points. 26 * - idtentry - Define exception entry points.
40 */ 27 */
@@ -70,10 +57,6 @@
70 .section .entry.text, "ax" 57 .section .entry.text, "ax"
71 58
72 59
73#ifndef CONFIG_PREEMPT
74#define retint_kernel retint_restore_args
75#endif
76
77#ifdef CONFIG_PARAVIRT 60#ifdef CONFIG_PARAVIRT
78ENTRY(native_usergs_sysret64) 61ENTRY(native_usergs_sysret64)
79 swapgs 62 swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
82#endif /* CONFIG_PARAVIRT */ 65#endif /* CONFIG_PARAVIRT */
83 66
84 67
85.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 68.macro TRACE_IRQS_IRETQ
86#ifdef CONFIG_TRACE_IRQFLAGS 69#ifdef CONFIG_TRACE_IRQFLAGS
87 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 70 bt $9,EFLAGS(%rsp) /* interrupts off? */
88 jnc 1f 71 jnc 1f
89 TRACE_IRQS_ON 72 TRACE_IRQS_ON
901: 731:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
116 call debug_stack_reset 99 call debug_stack_reset
117.endm 100.endm
118 101
119.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET 102.macro TRACE_IRQS_IRETQ_DEBUG
120 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 103 bt $9,EFLAGS(%rsp) /* interrupts off? */
121 jnc 1f 104 jnc 1f
122 TRACE_IRQS_ON_DEBUG 105 TRACE_IRQS_ON_DEBUG
1231: 1061:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
130#endif 113#endif
131 114
132/* 115/*
133 * C code is not supposed to know about undefined top of stack. Every time 116 * empty frame
134 * a C function with an pt_regs argument is called from the SYSCALL based
135 * fast path FIXUP_TOP_OF_STACK is needed.
136 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
137 * manipulation.
138 */
139
140 /* %rsp:at FRAMEEND */
141 .macro FIXUP_TOP_OF_STACK tmp offset=0
142 movq PER_CPU_VAR(old_rsp),\tmp
143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp)
146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
148 movq R11+\offset(%rsp),\tmp /* get eflags */
149 movq \tmp,EFLAGS+\offset(%rsp)
150 .endm
151
152 .macro RESTORE_TOP_OF_STACK tmp offset=0
153 movq RSP+\offset(%rsp),\tmp
154 movq \tmp,PER_CPU_VAR(old_rsp)
155 movq EFLAGS+\offset(%rsp),\tmp
156 movq \tmp,R11+\offset(%rsp)
157 .endm
158
159/*
160 * initial frame state for interrupts (and exceptions without error code)
161 */ 117 */
162 .macro EMPTY_FRAME start=1 offset=0 118 .macro EMPTY_FRAME start=1 offset=0
163 .if \start 119 .if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
173 * initial frame state for interrupts (and exceptions without error code) 129 * initial frame state for interrupts (and exceptions without error code)
174 */ 130 */
175 .macro INTR_FRAME start=1 offset=0 131 .macro INTR_FRAME start=1 offset=0
176 EMPTY_FRAME \start, SS+8+\offset-RIP 132 EMPTY_FRAME \start, 5*8+\offset
177 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ 133 /*CFI_REL_OFFSET ss, 4*8+\offset*/
178 CFI_REL_OFFSET rsp, RSP+\offset-RIP 134 CFI_REL_OFFSET rsp, 3*8+\offset
179 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ 135 /*CFI_REL_OFFSET rflags, 2*8+\offset*/
180 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ 136 /*CFI_REL_OFFSET cs, 1*8+\offset*/
181 CFI_REL_OFFSET rip, RIP+\offset-RIP 137 CFI_REL_OFFSET rip, 0*8+\offset
182 .endm 138 .endm
183 139
184/* 140/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
186 * with vector already pushed) 142 * with vector already pushed)
187 */ 143 */
188 .macro XCPT_FRAME start=1 offset=0 144 .macro XCPT_FRAME start=1 offset=0
189 INTR_FRAME \start, RIP+\offset-ORIG_RAX 145 INTR_FRAME \start, 1*8+\offset
190 .endm
191
192/*
193 * frame that enables calling into C.
194 */
195 .macro PARTIAL_FRAME start=1 offset=0
196 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
197 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
198 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
199 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
200 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
201 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
202 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
203 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
204 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
205 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
206 .endm 146 .endm
207 147
208/* 148/*
209 * frame that enables passing a complete pt_regs to a C function. 149 * frame that enables passing a complete pt_regs to a C function.
210 */ 150 */
211 .macro DEFAULT_FRAME start=1 offset=0 151 .macro DEFAULT_FRAME start=1 offset=0
212 PARTIAL_FRAME \start, R11+\offset-R15 152 XCPT_FRAME \start, ORIG_RAX+\offset
153 CFI_REL_OFFSET rdi, RDI+\offset
154 CFI_REL_OFFSET rsi, RSI+\offset
155 CFI_REL_OFFSET rdx, RDX+\offset
156 CFI_REL_OFFSET rcx, RCX+\offset
157 CFI_REL_OFFSET rax, RAX+\offset
158 CFI_REL_OFFSET r8, R8+\offset
159 CFI_REL_OFFSET r9, R9+\offset
160 CFI_REL_OFFSET r10, R10+\offset
161 CFI_REL_OFFSET r11, R11+\offset
213 CFI_REL_OFFSET rbx, RBX+\offset 162 CFI_REL_OFFSET rbx, RBX+\offset
214 CFI_REL_OFFSET rbp, RBP+\offset 163 CFI_REL_OFFSET rbp, RBP+\offset
215 CFI_REL_OFFSET r12, R12+\offset 164 CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
218 CFI_REL_OFFSET r15, R15+\offset 167 CFI_REL_OFFSET r15, R15+\offset
219 .endm 168 .endm
220 169
221ENTRY(save_paranoid)
222 XCPT_FRAME 1 RDI+8
223 cld
224 movq %rdi, RDI+8(%rsp)
225 movq %rsi, RSI+8(%rsp)
226 movq_cfi rdx, RDX+8
227 movq_cfi rcx, RCX+8
228 movq_cfi rax, RAX+8
229 movq %r8, R8+8(%rsp)
230 movq %r9, R9+8(%rsp)
231 movq %r10, R10+8(%rsp)
232 movq %r11, R11+8(%rsp)
233 movq_cfi rbx, RBX+8
234 movq %rbp, RBP+8(%rsp)
235 movq %r12, R12+8(%rsp)
236 movq %r13, R13+8(%rsp)
237 movq %r14, R14+8(%rsp)
238 movq %r15, R15+8(%rsp)
239 movl $1,%ebx
240 movl $MSR_GS_BASE,%ecx
241 rdmsr
242 testl %edx,%edx
243 js 1f /* negative -> in kernel */
244 SWAPGS
245 xorl %ebx,%ebx
2461: ret
247 CFI_ENDPROC
248END(save_paranoid)
249
250/* 170/*
251 * A newly forked process directly context switches into this address. 171 * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
252 * 172 *
253 * rdi: prev task we switched from 173 * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
254 */ 174 * then loads new ss, cs, and rip from previously programmed MSRs.
255ENTRY(ret_from_fork) 175 * rflags gets masked by a value from another MSR (so CLD and CLAC
256 DEFAULT_FRAME 176 * are not needed). SYSCALL does not save anything on the stack
257 177 * and does not change rsp.
258 LOCK ; btr $TIF_FORK,TI_flags(%r8)
259
260 pushq_cfi $0x0002
261 popfq_cfi # reset kernel eflags
262
263 call schedule_tail # rdi: 'prev' task parameter
264
265 GET_THREAD_INFO(%rcx)
266
267 RESTORE_REST
268
269 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
270 jz 1f
271
272 /*
273 * By the time we get here, we have no idea whether our pt_regs,
274 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
275 * the slow path, or one of the ia32entry paths.
276 * Use int_ret_from_sys_call to return, since it can safely handle
277 * all of the above.
278 */
279 jmp int_ret_from_sys_call
280
2811:
282 subq $REST_SKIP, %rsp # leave space for volatiles
283 CFI_ADJUST_CFA_OFFSET REST_SKIP
284 movq %rbp, %rdi
285 call *%rbx
286 movl $0, RAX(%rsp)
287 RESTORE_REST
288 jmp int_ret_from_sys_call
289 CFI_ENDPROC
290END(ret_from_fork)
291
292/*
293 * System call entry. Up to 6 arguments in registers are supported.
294 * 178 *
295 * SYSCALL does not save anything on the stack and does not change the 179 * Registers on entry:
296 * stack pointer. However, it does mask the flags register for us, so
297 * CLD and CLAC are not needed.
298 */
299
300/*
301 * Register setup:
302 * rax system call number 180 * rax system call number
181 * rcx return address
182 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
303 * rdi arg0 183 * rdi arg0
304 * rcx return address for syscall/sysret, C arg3
305 * rsi arg1 184 * rsi arg1
306 * rdx arg2 185 * rdx arg2
307 * r10 arg3 (--> moved to rcx for C) 186 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
308 * r8 arg4 187 * r8 arg4
309 * r9 arg5 188 * r9 arg5
310 * r11 eflags for syscall/sysret, temporary for C 189 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
311 * r12-r15,rbp,rbx saved by C code, not touched.
312 * 190 *
313 * Interrupts are off on entry.
314 * Only called from user space. 191 * Only called from user space.
315 * 192 *
316 * XXX if we had a free scratch register we could save the RSP into the stack frame 193 * When user can change pt_regs->foo always force IRET. That is because
317 * and report it properly in ps. Unfortunately we haven't.
318 *
319 * When user can change the frames always force IRET. That is because
320 * it deals with uncanonical addresses better. SYSRET has trouble 194 * it deals with uncanonical addresses better. SYSRET has trouble
321 * with them due to bugs in both AMD and Intel CPUs. 195 * with them due to bugs in both AMD and Intel CPUs.
322 */ 196 */
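The register convention spelled out above is the same one user space follows when issuing a raw SYSCALL: number in rax, arguments in rdi/rsi/rdx/r10/r8/r9, with rcx and r11 clobbered by the instruction itself. A minimal user-space C sketch of that convention (the wrapper name and the use of write(2) are this editor's illustrative assumptions, not part of the patch):

    #include <unistd.h>
    #include <sys/syscall.h>

    /* Issue a 3-argument syscall using the convention documented above:
     * rax = number, rdi/rsi/rdx = args, rcx and r11 are clobbered by SYSCALL. */
    static long raw_syscall3(long nr, long a0, long a1, long a2)
    {
            long ret;
            __asm__ volatile ("syscall"
                              : "=a" (ret)
                              : "0" (nr), "D" (a0), "S" (a1), "d" (a2)
                              : "rcx", "r11", "memory");
            return ret;
    }

    int main(void)
    {
            static const char msg[] = "raw SYSCALL, args in rdi/rsi/rdx\n";
            raw_syscall3(SYS_write, STDOUT_FILENO, (long)msg, sizeof(msg) - 1);
            return 0;
    }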
@@ -324,9 +198,15 @@ END(ret_from_fork)
324ENTRY(system_call) 198ENTRY(system_call)
325 CFI_STARTPROC simple 199 CFI_STARTPROC simple
326 CFI_SIGNAL_FRAME 200 CFI_SIGNAL_FRAME
327 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 201 CFI_DEF_CFA rsp,0
328 CFI_REGISTER rip,rcx 202 CFI_REGISTER rip,rcx
329 /*CFI_REGISTER rflags,r11*/ 203 /*CFI_REGISTER rflags,r11*/
204
205 /*
206 * Interrupts are off on entry.
207 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
208 * it is too small to ever cause noticeable irq latency.
209 */
330 SWAPGS_UNSAFE_STACK 210 SWAPGS_UNSAFE_STACK
331 /* 211 /*
332 * A hypervisor implementation might want to use a label 212 * A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
335 */ 215 */
336GLOBAL(system_call_after_swapgs) 216GLOBAL(system_call_after_swapgs)
337 217
338 movq %rsp,PER_CPU_VAR(old_rsp) 218 movq %rsp,PER_CPU_VAR(rsp_scratch)
339 movq PER_CPU_VAR(kernel_stack),%rsp 219 movq PER_CPU_VAR(kernel_stack),%rsp
220
221 /* Construct struct pt_regs on stack */
222 pushq_cfi $__USER_DS /* pt_regs->ss */
223 pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
340 /* 224 /*
341 * No need to follow this irqs off/on section - it's straight 225 * Re-enable interrupts.
342 * and short: 226 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
227 * must execute atomically in the face of possible interrupt-driven
228 * task preemption. We must enable interrupts only after we're done
229 * with using rsp_scratch:
343 */ 230 */
344 ENABLE_INTERRUPTS(CLBR_NONE) 231 ENABLE_INTERRUPTS(CLBR_NONE)
345 SAVE_ARGS 8, 0, rax_enosys=1 232 pushq_cfi %r11 /* pt_regs->flags */
346 movq_cfi rax,(ORIG_RAX-ARGOFFSET) 233 pushq_cfi $__USER_CS /* pt_regs->cs */
347 movq %rcx,RIP-ARGOFFSET(%rsp) 234 pushq_cfi %rcx /* pt_regs->ip */
348 CFI_REL_OFFSET rip,RIP-ARGOFFSET 235 CFI_REL_OFFSET rip,0
349 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 236 pushq_cfi_reg rax /* pt_regs->orig_ax */
237 pushq_cfi_reg rdi /* pt_regs->di */
238 pushq_cfi_reg rsi /* pt_regs->si */
239 pushq_cfi_reg rdx /* pt_regs->dx */
240 pushq_cfi_reg rcx /* pt_regs->cx */
241 pushq_cfi $-ENOSYS /* pt_regs->ax */
242 pushq_cfi_reg r8 /* pt_regs->r8 */
243 pushq_cfi_reg r9 /* pt_regs->r9 */
244 pushq_cfi_reg r10 /* pt_regs->r10 */
245 pushq_cfi_reg r11 /* pt_regs->r11 */
246 sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
247 CFI_ADJUST_CFA_OFFSET 6*8
248
249 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
350 jnz tracesys 250 jnz tracesys
351system_call_fastpath: 251system_call_fastpath:
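The push sequence above hand-builds a struct pt_regs from the top down: ss, sp, flags, cs, ip, orig_ax, then the C-clobbered registers, leaving the six callee-saved slots (bp, bx, r12-r15) unwritten on the fast path. A C sketch of the layout those pushes produce (the field order mirrors the x86-64 pt_regs; the offset checks are this editor's illustration, not part of the patch):

    #include <assert.h>
    #include <stddef.h>

    /* Lowest offset = last slots reserved by "sub $(6*8),%rsp",
     * highest offset = the first pushq ($__USER_DS, i.e. ss). */
    struct ptregs_sketch {
            unsigned long r15, r14, r13, r12, bp, bx;   /* not saved on the fast path */
            unsigned long r11, r10, r9, r8;
            unsigned long ax, cx, dx, si, di;           /* ax holds -ENOSYS initially */
            unsigned long orig_ax;
            unsigned long ip, cs, flags, sp, ss;        /* the iret frame */
    };

    int main(void)
    {
            assert(sizeof(struct ptregs_sketch) == 21 * 8);        /* SIZEOF_PTREGS */
            assert(offsetof(struct ptregs_sketch, ip) == 16 * 8);  /* RIP slot used above */
            assert(offsetof(struct ptregs_sketch, ss) == 20 * 8);
            return 0;
    }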
352#if __SYSCALL_MASK == ~0 252#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
355 andl $__SYSCALL_MASK,%eax 255 andl $__SYSCALL_MASK,%eax
356 cmpl $__NR_syscall_max,%eax 256 cmpl $__NR_syscall_max,%eax
357#endif 257#endif
358 ja ret_from_sys_call /* and return regs->ax */ 258 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
359 movq %r10,%rcx 259 movq %r10,%rcx
360 call *sys_call_table(,%rax,8) # XXX: rip relative 260 call *sys_call_table(,%rax,8)
361 movq %rax,RAX-ARGOFFSET(%rsp) 261 movq %rax,RAX(%rsp)
2621:
362/* 263/*
363 * Syscall return path ending with SYSRET (fast path) 264 * Syscall return path ending with SYSRET (fast path).
364 * Has incomplete stack frame and undefined top of stack. 265 * Has incompletely filled pt_regs.
365 */ 266 */
366ret_from_sys_call:
367 LOCKDEP_SYS_EXIT 267 LOCKDEP_SYS_EXIT
268 /*
269 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
270 * it is too small to ever cause noticeable irq latency.
271 */
368 DISABLE_INTERRUPTS(CLBR_NONE) 272 DISABLE_INTERRUPTS(CLBR_NONE)
369 TRACE_IRQS_OFF
370 273
371 /* 274 /*
372 * We must check ti flags with interrupts (or at least preemption) 275 * We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
376 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is 279 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
377 * very bad. 280 * very bad.
378 */ 281 */
379 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 282 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
380 jnz int_ret_from_sys_call_fixup /* Go the the slow path */ 283 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
381 284
382 CFI_REMEMBER_STATE 285 CFI_REMEMBER_STATE
383 /* 286
384 * sysretq will re-enable interrupts: 287 RESTORE_C_REGS_EXCEPT_RCX_R11
385 */ 288 movq RIP(%rsp),%rcx
386 TRACE_IRQS_ON
387 movq RIP-ARGOFFSET(%rsp),%rcx
388 CFI_REGISTER rip,rcx 289 CFI_REGISTER rip,rcx
389 RESTORE_ARGS 1,-ARG_SKIP,0 290 movq EFLAGS(%rsp),%r11
390 /*CFI_REGISTER rflags,r11*/ 291 /*CFI_REGISTER rflags,r11*/
391 movq PER_CPU_VAR(old_rsp), %rsp 292 movq RSP(%rsp),%rsp
293 /*
294 * 64bit SYSRET restores rip from rcx,
295 * rflags from r11 (but RF and VM bits are forced to 0),
296 * cs and ss are loaded from MSRs.
297 * Restoration of rflags re-enables interrupts.
298 */
392 USERGS_SYSRET64 299 USERGS_SYSRET64
393 300
394 CFI_RESTORE_STATE 301 CFI_RESTORE_STATE
395 302
396int_ret_from_sys_call_fixup: 303 /* Do syscall entry tracing */
397 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
398 jmp int_ret_from_sys_call_irqs_off
399
400 /* Do syscall tracing */
401tracesys: 304tracesys:
402 leaq -REST_SKIP(%rsp), %rdi 305 movq %rsp, %rdi
403 movq $AUDIT_ARCH_X86_64, %rsi 306 movl $AUDIT_ARCH_X86_64, %esi
404 call syscall_trace_enter_phase1 307 call syscall_trace_enter_phase1
405 test %rax, %rax 308 test %rax, %rax
406 jnz tracesys_phase2 /* if needed, run the slow path */ 309 jnz tracesys_phase2 /* if needed, run the slow path */
407 LOAD_ARGS 0 /* else restore clobbered regs */ 310 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
311 movq ORIG_RAX(%rsp), %rax
408 jmp system_call_fastpath /* and return to the fast path */ 312 jmp system_call_fastpath /* and return to the fast path */
409 313
410tracesys_phase2: 314tracesys_phase2:
411 SAVE_REST 315 SAVE_EXTRA_REGS
412 FIXUP_TOP_OF_STACK %rdi
413 movq %rsp, %rdi 316 movq %rsp, %rdi
414 movq $AUDIT_ARCH_X86_64, %rsi 317 movl $AUDIT_ARCH_X86_64, %esi
415 movq %rax,%rdx 318 movq %rax,%rdx
416 call syscall_trace_enter_phase2 319 call syscall_trace_enter_phase2
417 320
418 /* 321 /*
419 * Reload arg registers from stack in case ptrace changed them. 322 * Reload registers from stack in case ptrace changed them.
420 * We don't reload %rax because syscall_trace_entry_phase2() returned 323 * We don't reload %rax because syscall_trace_entry_phase2() returned
421 * the value it wants us to use in the table lookup. 324 * the value it wants us to use in the table lookup.
422 */ 325 */
423 LOAD_ARGS ARGOFFSET, 1 326 RESTORE_C_REGS_EXCEPT_RAX
424 RESTORE_REST 327 RESTORE_EXTRA_REGS
425#if __SYSCALL_MASK == ~0 328#if __SYSCALL_MASK == ~0
426 cmpq $__NR_syscall_max,%rax 329 cmpq $__NR_syscall_max,%rax
427#else 330#else
428 andl $__SYSCALL_MASK,%eax 331 andl $__SYSCALL_MASK,%eax
429 cmpl $__NR_syscall_max,%eax 332 cmpl $__NR_syscall_max,%eax
430#endif 333#endif
431 ja int_ret_from_sys_call /* RAX(%rsp) is already set */ 334 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
432 movq %r10,%rcx /* fixup for C */ 335 movq %r10,%rcx /* fixup for C */
433 call *sys_call_table(,%rax,8) 336 call *sys_call_table(,%rax,8)
434 movq %rax,RAX-ARGOFFSET(%rsp) 337 movq %rax,RAX(%rsp)
435 /* Use IRET because user could have changed frame */ 3381:
339 /* Use IRET because user could have changed pt_regs->foo */
436 340
437/* 341/*
438 * Syscall return path ending with IRET. 342 * Syscall return path ending with IRET.
439 * Has correct top of stack, but partial stack frame. 343 * Has correct iret frame.
440 */ 344 */
441GLOBAL(int_ret_from_sys_call) 345GLOBAL(int_ret_from_sys_call)
442 DISABLE_INTERRUPTS(CLBR_NONE) 346 DISABLE_INTERRUPTS(CLBR_NONE)
347int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
443 TRACE_IRQS_OFF 348 TRACE_IRQS_OFF
444int_ret_from_sys_call_irqs_off:
445 movl $_TIF_ALLWORK_MASK,%edi 349 movl $_TIF_ALLWORK_MASK,%edi
446 /* edi: mask to check */ 350 /* edi: mask to check */
447GLOBAL(int_with_check) 351GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
450 movl TI_flags(%rcx),%edx 354 movl TI_flags(%rcx),%edx
451 andl %edi,%edx 355 andl %edi,%edx
452 jnz int_careful 356 jnz int_careful
453 andl $~TS_COMPAT,TI_status(%rcx) 357 andl $~TS_COMPAT,TI_status(%rcx)
454 jmp retint_swapgs 358 jmp syscall_return
455 359
456 /* Either reschedule or signal or syscall exit tracking needed. */ 360 /* Either reschedule or signal or syscall exit tracking needed. */
457 /* First do a reschedule test. */ 361 /* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
468 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
469 jmp int_with_check 373 jmp int_with_check
470 374
471 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full pt_regs */
472int_very_careful: 376int_very_careful:
473 TRACE_IRQS_ON 377 TRACE_IRQS_ON
474 ENABLE_INTERRUPTS(CLBR_NONE) 378 ENABLE_INTERRUPTS(CLBR_NONE)
475int_check_syscall_exit_work: 379 SAVE_EXTRA_REGS
476 SAVE_REST
477 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
478 testl $_TIF_WORK_SYSCALL_EXIT,%edx 381 testl $_TIF_WORK_SYSCALL_EXIT,%edx
479 jz int_signal 382 jz int_signal
@@ -492,86 +395,192 @@ int_signal:
492 call do_notify_resume 395 call do_notify_resume
4931: movl $_TIF_WORK_MASK,%edi 3961: movl $_TIF_WORK_MASK,%edi
494int_restore_rest: 397int_restore_rest:
495 RESTORE_REST 398 RESTORE_EXTRA_REGS
496 DISABLE_INTERRUPTS(CLBR_NONE) 399 DISABLE_INTERRUPTS(CLBR_NONE)
497 TRACE_IRQS_OFF 400 TRACE_IRQS_OFF
498 jmp int_with_check 401 jmp int_with_check
402
403syscall_return:
404 /* The IRETQ could re-enable interrupts: */
405 DISABLE_INTERRUPTS(CLBR_ANY)
406 TRACE_IRQS_IRETQ
407
408 /*
409 * Try to use SYSRET instead of IRET if we're returning to
410 * a completely clean 64-bit userspace context.
411 */
412 movq RCX(%rsp),%rcx
413 cmpq %rcx,RIP(%rsp) /* RCX == RIP */
414 jne opportunistic_sysret_failed
415
416 /*
417 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
418 * in kernel space. This essentially lets the user take over
419 * the kernel, since userspace controls RSP. It's not worth
420 * testing for canonicalness exactly -- this check detects any
421 * of the 17 high bits set, which is true for non-canonical
422 * or kernel addresses. (This will pessimize vsyscall=native.
423 * Big deal.)
424 *
425 * If virtual addresses ever become wider, this will need
426 * to be updated to remain correct on both old and new CPUs.
427 */
428 .ifne __VIRTUAL_MASK_SHIFT - 47
429 .error "virtual address width changed -- SYSRET checks need update"
430 .endif
431 shr $__VIRTUAL_MASK_SHIFT, %rcx
432 jnz opportunistic_sysret_failed
433
434 cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
435 jne opportunistic_sysret_failed
436
437 movq R11(%rsp),%r11
438 cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
439 jne opportunistic_sysret_failed
440
441 /*
442 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
443 * restoring TF results in a trap from userspace immediately after
444 * SYSRET. This would cause an infinite loop whenever #DB happens
445 * with register state that satisfies the opportunistic SYSRET
446 * conditions. For example, single-stepping this user code:
447 *
448 * movq $stuck_here,%rcx
449 * pushfq
450 * popq %r11
451 * stuck_here:
452 *
453 * would never get past 'stuck_here'.
454 */
455 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
456 jnz opportunistic_sysret_failed
457
458 /* nothing to check for RSP */
459
460 cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
461 jne opportunistic_sysret_failed
462
463 /*
464 * We win! This label is here just for ease of understanding
465 * perf profiles. Nothing jumps here.
466 */
467syscall_return_via_sysret:
468 CFI_REMEMBER_STATE
469 /* r11 is already restored (see code above) */
470 RESTORE_C_REGS_EXCEPT_R11
471 movq RSP(%rsp),%rsp
472 USERGS_SYSRET64
473 CFI_RESTORE_STATE
474
475opportunistic_sysret_failed:
476 SWAPGS
477 jmp restore_c_regs_and_iret
499 CFI_ENDPROC 478 CFI_ENDPROC
500END(system_call) 479END(system_call)
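The opportunistic-SYSRET test above relies on a cheap canonicalness check: with 48-bit virtual addresses, shifting RCX right by __VIRTUAL_MASK_SHIFT (47) leaves a non-zero value exactly when any of the 17 high bits is set, i.e. when the return address is non-canonical or a kernel address. A small C sketch of that predicate (the constant and the sample addresses are illustrative assumptions):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define VIRTUAL_MASK_SHIFT 47   /* mirrors __VIRTUAL_MASK_SHIFT for 4-level paging */

    /* Equivalent of "shr $__VIRTUAL_MASK_SHIFT,%rcx; jnz opportunistic_sysret_failed" */
    static bool rip_ok_for_sysret(uint64_t rip)
    {
            return (rip >> VIRTUAL_MASK_SHIFT) == 0;
    }

    int main(void)
    {
            assert(rip_ok_for_sysret(0x00007fffffffe000ull));   /* user-space address */
            assert(!rip_ok_for_sysret(0x0000800000000000ull));  /* non-canonical */
            assert(!rip_ok_for_sysret(0xffffffff81000000ull));  /* kernel address */
            return 0;
    }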
501 480
481
502 .macro FORK_LIKE func 482 .macro FORK_LIKE func
503ENTRY(stub_\func) 483ENTRY(stub_\func)
504 CFI_STARTPROC 484 CFI_STARTPROC
505 popq %r11 /* save return address */ 485 DEFAULT_FRAME 0, 8 /* offset 8: return address */
506 PARTIAL_FRAME 0 486 SAVE_EXTRA_REGS 8
507 SAVE_REST 487 jmp sys_\func
508 pushq %r11 /* put it back on stack */
509 FIXUP_TOP_OF_STACK %r11, 8
510 DEFAULT_FRAME 0 8 /* offset 8: return address */
511 call sys_\func
512 RESTORE_TOP_OF_STACK %r11, 8
513 ret $REST_SKIP /* pop extended registers */
514 CFI_ENDPROC 488 CFI_ENDPROC
515END(stub_\func) 489END(stub_\func)
516 .endm 490 .endm
517 491
518 .macro FIXED_FRAME label,func
519ENTRY(\label)
520 CFI_STARTPROC
521 PARTIAL_FRAME 0 8 /* offset 8: return address */
522 FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
523 call \func
524 RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
525 ret
526 CFI_ENDPROC
527END(\label)
528 .endm
529
530 FORK_LIKE clone 492 FORK_LIKE clone
531 FORK_LIKE fork 493 FORK_LIKE fork
532 FORK_LIKE vfork 494 FORK_LIKE vfork
533 FIXED_FRAME stub_iopl, sys_iopl
534 495
535ENTRY(stub_execve) 496ENTRY(stub_execve)
536 CFI_STARTPROC 497 CFI_STARTPROC
537 addq $8, %rsp 498 DEFAULT_FRAME 0, 8
538 PARTIAL_FRAME 0 499 call sys_execve
539 SAVE_REST 500return_from_execve:
540 FIXUP_TOP_OF_STACK %r11 501 testl %eax, %eax
541 call sys_execve 502 jz 1f
542 movq %rax,RAX(%rsp) 503 /* exec failed, can use fast SYSRET code path in this case */
543 RESTORE_REST 504 ret
544 jmp int_ret_from_sys_call 5051:
506 /* must use IRET code path (pt_regs->cs may have changed) */
507 addq $8, %rsp
508 CFI_ADJUST_CFA_OFFSET -8
509 ZERO_EXTRA_REGS
510 movq %rax,RAX(%rsp)
511 jmp int_ret_from_sys_call
545 CFI_ENDPROC 512 CFI_ENDPROC
546END(stub_execve) 513END(stub_execve)
547 514/*
548ENTRY(stub_execveat) 515 * Remaining execve stubs are only 7 bytes long.
516 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
517 */
518 .align 8
519GLOBAL(stub_execveat)
549 CFI_STARTPROC 520 CFI_STARTPROC
550 addq $8, %rsp 521 DEFAULT_FRAME 0, 8
551 PARTIAL_FRAME 0 522 call sys_execveat
552 SAVE_REST 523 jmp return_from_execve
553 FIXUP_TOP_OF_STACK %r11
554 call sys_execveat
555 RESTORE_TOP_OF_STACK %r11
556 movq %rax,RAX(%rsp)
557 RESTORE_REST
558 jmp int_ret_from_sys_call
559 CFI_ENDPROC 524 CFI_ENDPROC
560END(stub_execveat) 525END(stub_execveat)
561 526
527#ifdef CONFIG_X86_X32_ABI
528 .align 8
529GLOBAL(stub_x32_execve)
530 CFI_STARTPROC
531 DEFAULT_FRAME 0, 8
532 call compat_sys_execve
533 jmp return_from_execve
534 CFI_ENDPROC
535END(stub_x32_execve)
536 .align 8
537GLOBAL(stub_x32_execveat)
538 CFI_STARTPROC
539 DEFAULT_FRAME 0, 8
540 call compat_sys_execveat
541 jmp return_from_execve
542 CFI_ENDPROC
543END(stub_x32_execveat)
544#endif
545
546#ifdef CONFIG_IA32_EMULATION
547 .align 8
548GLOBAL(stub32_execve)
549 CFI_STARTPROC
550 call compat_sys_execve
551 jmp return_from_execve
552 CFI_ENDPROC
553END(stub32_execve)
554 .align 8
555GLOBAL(stub32_execveat)
556 CFI_STARTPROC
557 call compat_sys_execveat
558 jmp return_from_execve
559 CFI_ENDPROC
560END(stub32_execveat)
561#endif
562
562/* 563/*
563 * sigreturn is special because it needs to restore all registers on return. 564 * sigreturn is special because it needs to restore all registers on return.
564 * This cannot be done with SYSRET, so use the IRET return path instead. 565 * This cannot be done with SYSRET, so use the IRET return path instead.
565 */ 566 */
566ENTRY(stub_rt_sigreturn) 567ENTRY(stub_rt_sigreturn)
567 CFI_STARTPROC 568 CFI_STARTPROC
568 addq $8, %rsp 569 DEFAULT_FRAME 0, 8
569 PARTIAL_FRAME 0 570 /*
570 SAVE_REST 571 * SAVE_EXTRA_REGS result is not normally needed:
571 FIXUP_TOP_OF_STACK %r11 572 * sigreturn overwrites all pt_regs->GPREGS.
573 * But sigreturn can fail (!), and there is no easy way to detect that.
574 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
575 * we SAVE_EXTRA_REGS here.
576 */
577 SAVE_EXTRA_REGS 8
572 call sys_rt_sigreturn 578 call sys_rt_sigreturn
573 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 579return_from_stub:
574 RESTORE_REST 580 addq $8, %rsp
581 CFI_ADJUST_CFA_OFFSET -8
582 RESTORE_EXTRA_REGS
583 movq %rax,RAX(%rsp)
575 jmp int_ret_from_sys_call 584 jmp int_ret_from_sys_call
576 CFI_ENDPROC 585 CFI_ENDPROC
577END(stub_rt_sigreturn) 586END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
579#ifdef CONFIG_X86_X32_ABI 588#ifdef CONFIG_X86_X32_ABI
580ENTRY(stub_x32_rt_sigreturn) 589ENTRY(stub_x32_rt_sigreturn)
581 CFI_STARTPROC 590 CFI_STARTPROC
582 addq $8, %rsp 591 DEFAULT_FRAME 0, 8
583 PARTIAL_FRAME 0 592 SAVE_EXTRA_REGS 8
584 SAVE_REST
585 FIXUP_TOP_OF_STACK %r11
586 call sys32_x32_rt_sigreturn 593 call sys32_x32_rt_sigreturn
587 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 594 jmp return_from_stub
588 RESTORE_REST
589 jmp int_ret_from_sys_call
590 CFI_ENDPROC 595 CFI_ENDPROC
591END(stub_x32_rt_sigreturn) 596END(stub_x32_rt_sigreturn)
597#endif
592 598
593ENTRY(stub_x32_execve) 599/*
594 CFI_STARTPROC 600 * A newly forked process directly context switches into this address.
595 addq $8, %rsp 601 *
596 PARTIAL_FRAME 0 602 * rdi: prev task we switched from
597 SAVE_REST 603 */
598 FIXUP_TOP_OF_STACK %r11 604ENTRY(ret_from_fork)
599 call compat_sys_execve 605 DEFAULT_FRAME
600 RESTORE_TOP_OF_STACK %r11
601 movq %rax,RAX(%rsp)
602 RESTORE_REST
603 jmp int_ret_from_sys_call
604 CFI_ENDPROC
605END(stub_x32_execve)
606 606
607ENTRY(stub_x32_execveat) 607 LOCK ; btr $TIF_FORK,TI_flags(%r8)
608 CFI_STARTPROC 608
609 addq $8, %rsp 609 pushq_cfi $0x0002
610 PARTIAL_FRAME 0 610 popfq_cfi # reset kernel eflags
611 SAVE_REST 611
612 FIXUP_TOP_OF_STACK %r11 612 call schedule_tail # rdi: 'prev' task parameter
613 call compat_sys_execveat 613
614 RESTORE_TOP_OF_STACK %r11 614 RESTORE_EXTRA_REGS
615 movq %rax,RAX(%rsp) 615
616 RESTORE_REST 616 testl $3,CS(%rsp) # from kernel_thread?
617
618 /*
619 * By the time we get here, we have no idea whether our pt_regs,
620 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
621 * the slow path, or one of the ia32entry paths.
622 * Use IRET code path to return, since it can safely handle
623 * all of the above.
624 */
625 jnz int_ret_from_sys_call
626
627 /* We came from kernel_thread */
628 /* nb: we depend on RESTORE_EXTRA_REGS above */
629 movq %rbp, %rdi
630 call *%rbx
631 movl $0, RAX(%rsp)
632 RESTORE_EXTRA_REGS
617 jmp int_ret_from_sys_call 633 jmp int_ret_from_sys_call
618 CFI_ENDPROC 634 CFI_ENDPROC
619END(stub_x32_execveat) 635END(ret_from_fork)
620
621#endif
622 636
623/* 637/*
624 * Build the entry stubs and pointer table with some assembler magic. 638 * Build the entry stubs with some assembler magic.
625 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 639 * We pack 1 stub into every 8-byte block.
626 * single cache line on all modern x86 implementations.
627 */ 640 */
628 .section .init.rodata,"a" 641 .align 8
629ENTRY(interrupt)
630 .section .entry.text
631 .p2align 5
632 .p2align CONFIG_X86_L1_CACHE_SHIFT
633ENTRY(irq_entries_start) 642ENTRY(irq_entries_start)
634 INTR_FRAME 643 INTR_FRAME
635vector=FIRST_EXTERNAL_VECTOR 644 vector=FIRST_EXTERNAL_VECTOR
636.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 645 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
637 .balign 32 646 pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
638 .rept 7 647 vector=vector+1
639 .if vector < FIRST_SYSTEM_VECTOR 648 jmp common_interrupt
640 .if vector <> FIRST_EXTERNAL_VECTOR
641 CFI_ADJUST_CFA_OFFSET -8 649 CFI_ADJUST_CFA_OFFSET -8
642 .endif 650 .align 8
6431: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ 651 .endr
644 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
645 jmp 2f
646 .endif
647 .previous
648 .quad 1b
649 .section .entry.text
650vector=vector+1
651 .endif
652 .endr
6532: jmp common_interrupt
654.endr
655 CFI_ENDPROC 652 CFI_ENDPROC
656END(irq_entries_start) 653END(irq_entries_start)
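The rewritten stub generator above packs one 8-byte stub per vector and still pushes ~vector+0x80 so the immediate fits in pushq's sign-extended byte; common_interrupt then adds -0x80, leaving ~vector in orig_ax for do_IRQ to invert. A C sketch checking that this encoding round-trips for every vector (a stand-alone illustration, not kernel code):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            for (int vector = 0; vector < 256; vector++) {
                    int imm = 0x7f - vector;            /* == ~vector + 0x80 as a signed value */
                    assert(imm >= -128 && imm <= 127);  /* fits pushq's sign-extended imm8 */
                    long orig_ax = (long)imm - 0x80;    /* common_interrupt: addq $-0x80,(%rsp) */
                    int recovered = (int)~orig_ax;      /* do_IRQ-style: vector = ~orig_ax */
                    assert(recovered == vector);
            }
            printf("~vector+0x80 encoding round-trips for all 256 vectors\n");
            return 0;
    }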
657 654
658.previous
659END(interrupt)
660.previous
661
662/* 655/*
663 * Interrupt entry/exit. 656 * Interrupt entry/exit.
664 * 657 *
@@ -669,47 +662,45 @@ END(interrupt)
669 662
670/* 0(%rsp): ~(interrupt number) */ 663/* 0(%rsp): ~(interrupt number) */
671 .macro interrupt func 664 .macro interrupt func
672 /* reserve pt_regs for scratch regs and rbp */
673 subq $ORIG_RAX-RBP, %rsp
674 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
675 cld 665 cld
676 /* start from rbp in pt_regs and jump over */ 666 /*
677 movq_cfi rdi, (RDI-RBP) 667 * Since nothing in interrupt handling code touches r12...r15 members
678 movq_cfi rsi, (RSI-RBP) 668 * of "struct pt_regs", and since interrupts can nest, we can save
679 movq_cfi rdx, (RDX-RBP) 669 * four stack slots and simultaneously provide
680 movq_cfi rcx, (RCX-RBP) 670 * an unwind-friendly stack layout by saving "truncated" pt_regs
681 movq_cfi rax, (RAX-RBP) 671 * exactly up to rbp slot, without these members.
682 movq_cfi r8, (R8-RBP) 672 */
683 movq_cfi r9, (R9-RBP) 673 ALLOC_PT_GPREGS_ON_STACK -RBP
684 movq_cfi r10, (R10-RBP) 674 SAVE_C_REGS -RBP
685 movq_cfi r11, (R11-RBP) 675 /* this goes to 0(%rsp) for unwinder, not for saving the value: */
686 676 SAVE_EXTRA_REGS_RBP -RBP
687 /* Save rbp so that we can unwind from get_irq_regs() */ 677
688 movq_cfi rbp, 0 678 leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
689
690 /* Save previous stack value */
691 movq %rsp, %rsi
692 679
693 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 680 testl $3, CS-RBP(%rsp)
694 testl $3, CS-RBP(%rsi)
695 je 1f 681 je 1f
696 SWAPGS 682 SWAPGS
6831:
697 /* 684 /*
685 * Save previous stack pointer, optionally switch to interrupt stack.
698 * irq_count is used to check if a CPU is already on an interrupt stack 686 * irq_count is used to check if a CPU is already on an interrupt stack
699 * or not. While this is essentially redundant with preempt_count it is 687 * or not. While this is essentially redundant with preempt_count it is
700 * a little cheaper to use a separate counter in the PDA (short of 688 * a little cheaper to use a separate counter in the PDA (short of
701 * moving irq_enter into assembly, which would be too much work) 689 * moving irq_enter into assembly, which would be too much work)
702 */ 690 */
7031: incl PER_CPU_VAR(irq_count) 691 movq %rsp, %rsi
692 incl PER_CPU_VAR(irq_count)
704 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 693 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
705 CFI_DEF_CFA_REGISTER rsi 694 CFI_DEF_CFA_REGISTER rsi
706
707 /* Store previous stack value */
708 pushq %rsi 695 pushq %rsi
696 /*
697 * For debugger:
698 * "CFA (Current Frame Address) is the value on stack + offset"
699 */
709 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 700 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
710 0x77 /* DW_OP_breg7 */, 0, \ 701 0x77 /* DW_OP_breg7 (rsp) */, 0, \
711 0x06 /* DW_OP_deref */, \ 702 0x06 /* DW_OP_deref */, \
712 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 703 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
713 0x22 /* DW_OP_plus */ 704 0x22 /* DW_OP_plus */
714 /* We entered an interrupt context - irqs are off: */ 705 /* We entered an interrupt context - irqs are off: */
715 TRACE_IRQS_OFF 706 TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
727 ASM_CLAC 718 ASM_CLAC
728 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 719 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
729 interrupt do_IRQ 720 interrupt do_IRQ
730 /* 0(%rsp): old_rsp-ARGOFFSET */ 721 /* 0(%rsp): old RSP */
731ret_from_intr: 722ret_from_intr:
732 DISABLE_INTERRUPTS(CLBR_NONE) 723 DISABLE_INTERRUPTS(CLBR_NONE)
733 TRACE_IRQS_OFF 724 TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
735 726
736 /* Restore saved previous stack */ 727 /* Restore saved previous stack */
737 popq %rsi 728 popq %rsi
738 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 729 CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
739 leaq ARGOFFSET-RBP(%rsi), %rsp 730 /* return code expects complete pt_regs - adjust rsp accordingly: */
731 leaq -RBP(%rsi),%rsp
740 CFI_DEF_CFA_REGISTER rsp 732 CFI_DEF_CFA_REGISTER rsp
741 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 733 CFI_ADJUST_CFA_OFFSET RBP
742 734
743exit_intr: 735 testl $3,CS(%rsp)
744 GET_THREAD_INFO(%rcx)
745 testl $3,CS-ARGOFFSET(%rsp)
746 je retint_kernel 736 je retint_kernel
747
748 /* Interrupt came from user space */ 737 /* Interrupt came from user space */
738
739 GET_THREAD_INFO(%rcx)
749 /* 740 /*
750 * Has a correct top of stack, but a partial stack frame
751 * %rcx: thread info. Interrupts off. 741 * %rcx: thread info. Interrupts off.
752 */ 742 */
753retint_with_reschedule: 743retint_with_reschedule:
@@ -766,70 +756,34 @@ retint_swapgs: /* return to user-space */
766 DISABLE_INTERRUPTS(CLBR_ANY) 756 DISABLE_INTERRUPTS(CLBR_ANY)
767 TRACE_IRQS_IRETQ 757 TRACE_IRQS_IRETQ
768 758
769 /*
770 * Try to use SYSRET instead of IRET if we're returning to
771 * a completely clean 64-bit userspace context.
772 */
773 movq (RCX-R11)(%rsp), %rcx
774 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
775 jne opportunistic_sysret_failed
776
777 /*
778 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
779 * in kernel space. This essentially lets the user take over
780 * the kernel, since userspace controls RSP. It's not worth
781 * testing for canonicalness exactly -- this check detects any
782 * of the 17 high bits set, which is true for non-canonical
783 * or kernel addresses. (This will pessimize vsyscall=native.
784 * Big deal.)
785 *
786 * If virtual addresses ever become wider, this will need
787 * to be updated to remain correct on both old and new CPUs.
788 */
789 .ifne __VIRTUAL_MASK_SHIFT - 47
790 .error "virtual address width changed -- sysret checks need update"
791 .endif
792 shr $__VIRTUAL_MASK_SHIFT, %rcx
793 jnz opportunistic_sysret_failed
794
795 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
796 jne opportunistic_sysret_failed
797
798 movq (R11-ARGOFFSET)(%rsp), %r11
799 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
800 jne opportunistic_sysret_failed
801
802 testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
803 jnz opportunistic_sysret_failed
804
805 /* nothing to check for RSP */
806
807 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
808 jne opportunistic_sysret_failed
809
810 /*
811 * We win! This label is here just for ease of understanding
812 * perf profiles. Nothing jumps here.
813 */
814irq_return_via_sysret:
815 CFI_REMEMBER_STATE
816 RESTORE_ARGS 1,8,1
817 movq (RSP-RIP)(%rsp),%rsp
818 USERGS_SYSRET64
819 CFI_RESTORE_STATE
820
821opportunistic_sysret_failed:
822 SWAPGS 759 SWAPGS
823 jmp restore_args 760 jmp restore_c_regs_and_iret
824 761
825retint_restore_args: /* return to kernel space */ 762/* Returning to kernel space */
826 DISABLE_INTERRUPTS(CLBR_ANY) 763retint_kernel:
764#ifdef CONFIG_PREEMPT
765 /* Interrupts are off */
766 /* Check if we need preemption */
767 bt $9,EFLAGS(%rsp) /* interrupts were off? */
768 jnc 1f
7690: cmpl $0,PER_CPU_VAR(__preempt_count)
770 jnz 1f
771 call preempt_schedule_irq
772 jmp 0b
7731:
774#endif
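With retint_kernel now inlined, the preemption decision reads: only if the interrupted kernel context had interrupts enabled (IF, bit 9 of the saved flags) and the preempt count is zero do we call preempt_schedule_irq(), looping until the count becomes non-zero. A rough stand-alone C rendering of that loop (the helper names are stand-ins, not the kernel's symbols):

    #include <stdio.h>

    #define X86_EFLAGS_IF (1UL << 9)        /* the "bt $9,EFLAGS(%rsp)" test above */

    static int fake_preempt_count;          /* stands in for per-cpu __preempt_count */
    static void fake_preempt_schedule_irq(void) { fake_preempt_count = 1; }

    static void retint_kernel_sketch(unsigned long saved_flags)
    {
            if (!(saved_flags & X86_EFLAGS_IF))
                    return;                 /* interrupts were off: never preempt here */
            while (fake_preempt_count == 0)
                    fake_preempt_schedule_irq();
    }

    int main(void)
    {
            retint_kernel_sketch(X86_EFLAGS_IF);
            printf("preempt_count after: %d\n", fake_preempt_count);
            return 0;
    }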
827 /* 775 /*
828 * The iretq could re-enable interrupts: 776 * The iretq could re-enable interrupts:
829 */ 777 */
830 TRACE_IRQS_IRETQ 778 TRACE_IRQS_IRETQ
831restore_args: 779
832 RESTORE_ARGS 1,8,1 780/*
781 * At this label, code paths which return to kernel and to user,
782 * which come from interrupts/exception and from syscalls, merge.
783 */
784restore_c_regs_and_iret:
785 RESTORE_C_REGS
786 REMOVE_PT_GPREGS_FROM_STACK 8
833 787
834irq_return: 788irq_return:
835 INTERRUPT_RETURN 789 INTERRUPT_RETURN
@@ -900,28 +854,17 @@ retint_signal:
900 jz retint_swapgs 854 jz retint_swapgs
901 TRACE_IRQS_ON 855 TRACE_IRQS_ON
902 ENABLE_INTERRUPTS(CLBR_NONE) 856 ENABLE_INTERRUPTS(CLBR_NONE)
903 SAVE_REST 857 SAVE_EXTRA_REGS
904 movq $-1,ORIG_RAX(%rsp) 858 movq $-1,ORIG_RAX(%rsp)
905 xorl %esi,%esi # oldset 859 xorl %esi,%esi # oldset
906 movq %rsp,%rdi # &pt_regs 860 movq %rsp,%rdi # &pt_regs
907 call do_notify_resume 861 call do_notify_resume
908 RESTORE_REST 862 RESTORE_EXTRA_REGS
909 DISABLE_INTERRUPTS(CLBR_NONE) 863 DISABLE_INTERRUPTS(CLBR_NONE)
910 TRACE_IRQS_OFF 864 TRACE_IRQS_OFF
911 GET_THREAD_INFO(%rcx) 865 GET_THREAD_INFO(%rcx)
912 jmp retint_with_reschedule 866 jmp retint_with_reschedule
913 867
914#ifdef CONFIG_PREEMPT
915 /* Returning to kernel space. Check if we need preemption */
916 /* rcx: threadinfo. interrupts off. */
917ENTRY(retint_kernel)
918 cmpl $0,PER_CPU_VAR(__preempt_count)
919 jnz retint_restore_args
920 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
921 jnc retint_restore_args
922 call preempt_schedule_irq
923 jmp exit_intr
924#endif
925 CFI_ENDPROC 868 CFI_ENDPROC
926END(common_interrupt) 869END(common_interrupt)
927 870
@@ -1010,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
1010/* 953/*
1011 * Exception entry points. 954 * Exception entry points.
1012 */ 955 */
1013#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 956#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
1014 957
1015.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 958.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
1016ENTRY(\sym) 959ENTRY(\sym)
@@ -1032,8 +975,7 @@ ENTRY(\sym)
1032 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 975 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1033 .endif 976 .endif
1034 977
1035 subq $ORIG_RAX-R15, %rsp 978 ALLOC_PT_GPREGS_ON_STACK
1036 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1037 979
1038 .if \paranoid 980 .if \paranoid
1039 .if \paranoid == 1 981 .if \paranoid == 1
@@ -1041,10 +983,11 @@ ENTRY(\sym)
1041 testl $3, CS(%rsp) /* If coming from userspace, switch */ 983 testl $3, CS(%rsp) /* If coming from userspace, switch */
1042 jnz 1f /* stacks. */ 984 jnz 1f /* stacks. */
1043 .endif 985 .endif
1044 call save_paranoid 986 call paranoid_entry
1045 .else 987 .else
1046 call error_entry 988 call error_entry
1047 .endif 989 .endif
990 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
1048 991
1049 DEFAULT_FRAME 0 992 DEFAULT_FRAME 0
1050 993
@@ -1066,19 +1009,20 @@ ENTRY(\sym)
1066 .endif 1009 .endif
1067 1010
1068 .if \shift_ist != -1 1011 .if \shift_ist != -1
1069 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1012 subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1070 .endif 1013 .endif
1071 1014
1072 call \do_sym 1015 call \do_sym
1073 1016
1074 .if \shift_ist != -1 1017 .if \shift_ist != -1
1075 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1018 addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1076 .endif 1019 .endif
1077 1020
1021 /* these procedures expect "no swapgs" flag in ebx */
1078 .if \paranoid 1022 .if \paranoid
1079 jmp paranoid_exit /* %ebx: no swapgs flag */ 1023 jmp paranoid_exit
1080 .else 1024 .else
1081 jmp error_exit /* %ebx: no swapgs flag */ 1025 jmp error_exit
1082 .endif 1026 .endif
1083 1027
1084 .if \paranoid == 1 1028 .if \paranoid == 1
@@ -1282,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
1282 addq $0x30,%rsp 1226 addq $0x30,%rsp
1283 CFI_ADJUST_CFA_OFFSET -0x30 1227 CFI_ADJUST_CFA_OFFSET -0x30
1284 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1228 pushq_cfi $-1 /* orig_ax = -1 => not a system call */
1285 SAVE_ALL 1229 ALLOC_PT_GPREGS_ON_STACK
1230 SAVE_C_REGS
1231 SAVE_EXTRA_REGS
1286 jmp error_exit 1232 jmp error_exit
1287 CFI_ENDPROC 1233 CFI_ENDPROC
1288END(xen_failsafe_callback) 1234END(xen_failsafe_callback)
@@ -1314,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
1314idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) 1260idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
1315#endif 1261#endif
1316 1262
1317 /* 1263/*
1318 * "Paranoid" exit path from exception stack. This is invoked 1264 * Save all registers in pt_regs, and switch gs if needed.
1319 * only on return from non-NMI IST interrupts that came 1265 * Use slow, but surefire "are we in kernel?" check.
1320 * from kernel space. 1266 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1321 * 1267 */
1322 * We may be returning to very strange contexts (e.g. very early 1268ENTRY(paranoid_entry)
1323 * in syscall entry), so checking for preemption here would 1269 XCPT_FRAME 1 15*8
1324 * be complicated. Fortunately, there's no good reason 1270 cld
1325 * to try to handle preemption here. 1271 SAVE_C_REGS 8
1326 */ 1272 SAVE_EXTRA_REGS 8
1273 movl $1,%ebx
1274 movl $MSR_GS_BASE,%ecx
1275 rdmsr
1276 testl %edx,%edx
1277 js 1f /* negative -> in kernel */
1278 SWAPGS
1279 xorl %ebx,%ebx
12801: ret
1281 CFI_ENDPROC
1282END(paranoid_entry)
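paranoid_entry's "are we in kernel?" test reads MSR_GS_BASE with rdmsr and looks only at the sign of the high half: kernel per-cpu areas live in the top half of the address space, so %edx is negative exactly when GS already holds the kernel base and SWAPGS must be skipped. A C sketch of that predicate on a 64-bit MSR value (sample values are illustrative only):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Equivalent of "rdmsr; testl %edx,%edx; js 1f": bit 63 of MSR_GS_BASE set
     * means GS already points at kernel per-cpu data, so no SWAPGS is needed. */
    static bool gs_base_is_kernel(uint64_t gs_base)
    {
            return (int32_t)(gs_base >> 32) < 0;
    }

    int main(void)
    {
            assert(gs_base_is_kernel(0xffff88007fc00000ull));   /* kernel per-cpu base */
            assert(!gs_base_is_kernel(0x00007f1234560000ull));  /* user TLS base */
            return 0;
    }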
1327 1283
1328 /* ebx: no swapgs flag */ 1284/*
1285 * "Paranoid" exit path from exception stack. This is invoked
1286 * only on return from non-NMI IST interrupts that came
1287 * from kernel space.
1288 *
1289 * We may be returning to very strange contexts (e.g. very early
1290 * in syscall entry), so checking for preemption here would
1291 * be complicated. Fortunately, there's no good reason
1292 * to try to handle preemption here.
1293 */
1294/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1329ENTRY(paranoid_exit) 1295ENTRY(paranoid_exit)
1330 DEFAULT_FRAME 1296 DEFAULT_FRAME
1331 DISABLE_INTERRUPTS(CLBR_NONE) 1297 DISABLE_INTERRUPTS(CLBR_NONE)
1332 TRACE_IRQS_OFF_DEBUG 1298 TRACE_IRQS_OFF_DEBUG
1333 testl %ebx,%ebx /* swapgs needed? */ 1299 testl %ebx,%ebx /* swapgs needed? */
1334 jnz paranoid_restore 1300 jnz paranoid_exit_no_swapgs
1335 TRACE_IRQS_IRETQ 0 1301 TRACE_IRQS_IRETQ
1336 SWAPGS_UNSAFE_STACK 1302 SWAPGS_UNSAFE_STACK
1337 RESTORE_ALL 8 1303 jmp paranoid_exit_restore
1338 INTERRUPT_RETURN 1304paranoid_exit_no_swapgs:
1339paranoid_restore: 1305 TRACE_IRQS_IRETQ_DEBUG
1340 TRACE_IRQS_IRETQ_DEBUG 0 1306paranoid_exit_restore:
1341 RESTORE_ALL 8 1307 RESTORE_EXTRA_REGS
1308 RESTORE_C_REGS
1309 REMOVE_PT_GPREGS_FROM_STACK 8
1342 INTERRUPT_RETURN 1310 INTERRUPT_RETURN
1343 CFI_ENDPROC 1311 CFI_ENDPROC
1344END(paranoid_exit) 1312END(paranoid_exit)
1345 1313
1346/* 1314/*
1347 * Exception entry point. This expects an error code/orig_rax on the stack. 1315 * Save all registers in pt_regs, and switch gs if needed.
1348 * returns in "no swapgs flag" in %ebx. 1316 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1349 */ 1317 */
1350ENTRY(error_entry) 1318ENTRY(error_entry)
1351 XCPT_FRAME 1319 XCPT_FRAME 1 15*8
1352 CFI_ADJUST_CFA_OFFSET 15*8
1353 /* oldrax contains error code */
1354 cld 1320 cld
1355 movq %rdi, RDI+8(%rsp) 1321 SAVE_C_REGS 8
1356 movq %rsi, RSI+8(%rsp) 1322 SAVE_EXTRA_REGS 8
1357 movq %rdx, RDX+8(%rsp)
1358 movq %rcx, RCX+8(%rsp)
1359 movq %rax, RAX+8(%rsp)
1360 movq %r8, R8+8(%rsp)
1361 movq %r9, R9+8(%rsp)
1362 movq %r10, R10+8(%rsp)
1363 movq %r11, R11+8(%rsp)
1364 movq_cfi rbx, RBX+8
1365 movq %rbp, RBP+8(%rsp)
1366 movq %r12, R12+8(%rsp)
1367 movq %r13, R13+8(%rsp)
1368 movq %r14, R14+8(%rsp)
1369 movq %r15, R15+8(%rsp)
1370 xorl %ebx,%ebx 1323 xorl %ebx,%ebx
1371 testl $3,CS+8(%rsp) 1324 testl $3,CS+8(%rsp)
1372 je error_kernelspace 1325 je error_kernelspace
@@ -1376,12 +1329,12 @@ error_sti:
1376 TRACE_IRQS_OFF 1329 TRACE_IRQS_OFF
1377 ret 1330 ret
1378 1331
1379/* 1332 /*
1380 * There are two places in the kernel that can potentially fault with 1333 * There are two places in the kernel that can potentially fault with
1381 * usergs. Handle them here. B stepping K8s sometimes report a 1334 * usergs. Handle them here. B stepping K8s sometimes report a
1382 * truncated RIP for IRET exceptions returning to compat mode. Check 1335 * truncated RIP for IRET exceptions returning to compat mode. Check
1383 * for these here too. 1336 * for these here too.
1384 */ 1337 */
1385error_kernelspace: 1338error_kernelspace:
1386 CFI_REL_OFFSET rcx, RCX+8 1339 CFI_REL_OFFSET rcx, RCX+8
1387 incl %ebx 1340 incl %ebx
@@ -1411,11 +1364,11 @@ error_bad_iret:
1411END(error_entry) 1364END(error_entry)
1412 1365
1413 1366
1414/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ 1367/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1415ENTRY(error_exit) 1368ENTRY(error_exit)
1416 DEFAULT_FRAME 1369 DEFAULT_FRAME
1417 movl %ebx,%eax 1370 movl %ebx,%eax
1418 RESTORE_REST 1371 RESTORE_EXTRA_REGS
1419 DISABLE_INTERRUPTS(CLBR_NONE) 1372 DISABLE_INTERRUPTS(CLBR_NONE)
1420 TRACE_IRQS_OFF 1373 TRACE_IRQS_OFF
1421 GET_THREAD_INFO(%rcx) 1374 GET_THREAD_INFO(%rcx)
@@ -1430,19 +1383,7 @@ ENTRY(error_exit)
1430 CFI_ENDPROC 1383 CFI_ENDPROC
1431END(error_exit) 1384END(error_exit)
1432 1385
1433/* 1386/* Runs on exception stack */
1434 * Test if a given stack is an NMI stack or not.
1435 */
1436 .macro test_in_nmi reg stack nmi_ret normal_ret
1437 cmpq %\reg, \stack
1438 ja \normal_ret
1439 subq $EXCEPTION_STKSZ, %\reg
1440 cmpq %\reg, \stack
1441 jb \normal_ret
1442 jmp \nmi_ret
1443 .endm
1444
1445 /* runs on exception stack */
1446ENTRY(nmi) 1387ENTRY(nmi)
1447 INTR_FRAME 1388 INTR_FRAME
1448 PARAVIRT_ADJUST_EXCEPTION_FRAME 1389 PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1478,7 +1419,7 @@ ENTRY(nmi)
1478 * NMI. 1419 * NMI.
1479 */ 1420 */
1480 1421
1481 /* Use %rdx as out temp variable throughout */ 1422 /* Use %rdx as our temp variable throughout */
1482 pushq_cfi %rdx 1423 pushq_cfi %rdx
1483 CFI_REL_OFFSET rdx, 0 1424 CFI_REL_OFFSET rdx, 0
1484 1425
@@ -1503,8 +1444,17 @@ ENTRY(nmi)
1503 * We check the variable because the first NMI could be in a 1444 * We check the variable because the first NMI could be in a
1504 * breakpoint routine using a breakpoint stack. 1445 * breakpoint routine using a breakpoint stack.
1505 */ 1446 */
1506 lea 6*8(%rsp), %rdx 1447 lea 6*8(%rsp), %rdx
1507 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1448 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1449 cmpq %rdx, 4*8(%rsp)
1450 /* If the stack pointer is above the NMI stack, this is a normal NMI */
1451 ja first_nmi
1452 subq $EXCEPTION_STKSZ, %rdx
1453 cmpq %rdx, 4*8(%rsp)
1454 /* If it is below the NMI stack, it is a normal NMI */
1455 jb first_nmi
1456 /* Ah, it is within the NMI stack, treat it as nested */
1457
1508 CFI_REMEMBER_STATE 1458 CFI_REMEMBER_STATE
1509 1459
1510nested_nmi: 1460nested_nmi:
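The open-coded comparison above (replacing the old test_in_nmi macro) is just a bounds check: the NMI is nested only if the interrupted stack pointer falls inside the [top - EXCEPTION_STKSZ, top] window of the NMI IST stack. A C sketch of the same test (the EXCEPTION_STKSZ value is assumed here for illustration):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define EXCEPTION_STKSZ 4096u   /* assumption for the sketch: one page per IST stack */

    /* prev_rsp is the interrupted stack pointer (4*8(%rsp)),
     * nmi_stack_top is what "lea 6*8(%rsp),%rdx" computed. */
    static bool nmi_is_nested(uint64_t prev_rsp, uint64_t nmi_stack_top)
    {
            if (prev_rsp > nmi_stack_top)                       /* above the NMI stack: normal NMI */
                    return false;
            if (prev_rsp < nmi_stack_top - EXCEPTION_STKSZ)     /* below the NMI stack: normal NMI */
                    return false;
            return true;                                        /* within the NMI stack: nested */
    }

    int main(void)
    {
            assert(nmi_is_nested(0x1000 - 16, 0x1000));
            assert(!nmi_is_nested(0x2000, 0x1000));
            return 0;
    }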
@@ -1597,7 +1547,7 @@ first_nmi:
1597 .rept 5 1547 .rept 5
1598 pushq_cfi 11*8(%rsp) 1548 pushq_cfi 11*8(%rsp)
1599 .endr 1549 .endr
1600 CFI_DEF_CFA_OFFSET SS+8-RIP 1550 CFI_DEF_CFA_OFFSET 5*8
1601 1551
1602 /* Everything up to here is safe from nested NMIs */ 1552 /* Everything up to here is safe from nested NMIs */
1603 1553
@@ -1625,7 +1575,7 @@ repeat_nmi:
1625 pushq_cfi -6*8(%rsp) 1575 pushq_cfi -6*8(%rsp)
1626 .endr 1576 .endr
1627 subq $(5*8), %rsp 1577 subq $(5*8), %rsp
1628 CFI_DEF_CFA_OFFSET SS+8-RIP 1578 CFI_DEF_CFA_OFFSET 5*8
1629end_repeat_nmi: 1579end_repeat_nmi:
1630 1580
1631 /* 1581 /*
@@ -1634,16 +1584,16 @@ end_repeat_nmi:
1634 * so that we repeat another NMI. 1584 * so that we repeat another NMI.
1635 */ 1585 */
1636 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1586 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1637 subq $ORIG_RAX-R15, %rsp 1587 ALLOC_PT_GPREGS_ON_STACK
1638 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1588
1639 /* 1589 /*
1640 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit 1590 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1641 * as we should not be calling schedule in NMI context. 1591 * as we should not be calling schedule in NMI context.
1642 * Even with normal interrupts enabled. An NMI should not be 1592 * Even with normal interrupts enabled. An NMI should not be
1643 * setting NEED_RESCHED or anything that normal interrupts and 1593 * setting NEED_RESCHED or anything that normal interrupts and
1644 * exceptions might do. 1594 * exceptions might do.
1645 */ 1595 */
1646 call save_paranoid 1596 call paranoid_entry
1647 DEFAULT_FRAME 0 1597 DEFAULT_FRAME 0
1648 1598
1649 /* 1599 /*
@@ -1674,8 +1624,10 @@ end_repeat_nmi:
1674nmi_swapgs: 1624nmi_swapgs:
1675 SWAPGS_UNSAFE_STACK 1625 SWAPGS_UNSAFE_STACK
1676nmi_restore: 1626nmi_restore:
1627 RESTORE_EXTRA_REGS
1628 RESTORE_C_REGS
1677 /* Pop the extra iret frame at once */ 1629 /* Pop the extra iret frame at once */
1678 RESTORE_ALL 6*8 1630 REMOVE_PT_GPREGS_FROM_STACK 6*8
1679 1631
1680 /* Clear the NMI executing stack variable */ 1632 /* Clear the NMI executing stack variable */
1681 movq $0, 5*8(%rsp) 1633 movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c4f8d4659070..2b55ee6db053 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
177 */ 177 */
178 load_ucode_bsp(); 178 load_ucode_bsp();
179 179
180 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
181 early_printk("Kernel alive\n");
182
183 clear_page(init_level4_pgt); 180 clear_page(init_level4_pgt);
184 /* set init_level4_pgt kernel high mapping*/ 181 /* set init_level4_pgt kernel high mapping*/
185 init_level4_pgt[511] = early_level4_pgt[511]; 182 init_level4_pgt[511] = early_level4_pgt[511];
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h> 24#include <asm/nops.h>
25#include <asm/bootparam.h>
25 26
26/* Physical address */ 27/* Physical address */
27#define pa(X) ((X) - __PAGE_OFFSET) 28#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
90 91
91 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 92 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
92 us to not reload segments */ 93 us to not reload segments */
93 testb $(1<<6), BP_loadflags(%esi) 94 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
94 jnz 2f 95 jnz 2f
95 96
96/* 97/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit 2 * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
56 * %rsi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
57 * 57 *
58 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
59 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86/boot/compressed/head_64.S.
60 * 60 *
61 * We only come here initially at boot nothing else comes here. 61 * We only come here initially at boot nothing else comes here.
62 * 62 *
@@ -146,7 +146,7 @@ startup_64:
146 leaq level2_kernel_pgt(%rip), %rdi 146 leaq level2_kernel_pgt(%rip), %rdi
147 leaq 4096(%rdi), %r8 147 leaq 4096(%rdi), %r8
148 /* See if it is a valid page table entry */ 148 /* See if it is a valid page table entry */
1491: testq $1, 0(%rdi) 1491: testb $1, 0(%rdi)
150 jz 2f 150 jz 2f
151 addq %rbp, 0(%rdi) 151 addq %rbp, 0(%rdi)
152 /* Go to the next page */ 152 /* Go to the next page */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..367f39d35e9c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void)
42 * be set (so that the clts/stts pair does nothing that is 42 * be set (so that the clts/stts pair does nothing that is
43 * visible in the interrupted kernel thread). 43 * visible in the interrupted kernel thread).
44 * 44 *
45 * Except for the eagerfpu case when we return 1 unless we've already 45 * Except for the eagerfpu case when we return true; in the likely case
46 * been eager and saved the state in kernel_fpu_begin(). 46 * the thread has FPU but we are not going to set/clear TS.
47 */ 47 */
48static inline bool interrupted_kernel_fpu_idle(void) 48static inline bool interrupted_kernel_fpu_idle(void)
49{ 49{
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
51 return false; 51 return false;
52 52
53 if (use_eager_fpu()) 53 if (use_eager_fpu())
54 return __thread_has_fpu(current); 54 return true;
55 55
56 return !__thread_has_fpu(current) && 56 return !__thread_has_fpu(current) &&
57 (read_cr0() & X86_CR0_TS); 57 (read_cr0() & X86_CR0_TS);
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
68static inline bool interrupted_user_mode(void) 68static inline bool interrupted_user_mode(void)
69{ 69{
70 struct pt_regs *regs = get_irq_regs(); 70 struct pt_regs *regs = get_irq_regs();
71 return regs && user_mode_vm(regs); 71 return regs && user_mode(regs);
72} 72}
73 73
74/* 74/*
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void)
94 94
95 if (__thread_has_fpu(me)) { 95 if (__thread_has_fpu(me)) {
96 __save_init_fpu(me); 96 __save_init_fpu(me);
97 } else if (!use_eager_fpu()) { 97 } else {
98 this_cpu_write(fpu_owner_task, NULL); 98 this_cpu_write(fpu_owner_task, NULL);
99 clts(); 99 if (!use_eager_fpu())
100 clts();
100 } 101 }
101} 102}
102EXPORT_SYMBOL(__kernel_fpu_begin); 103EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void)
107 108
108 if (__thread_has_fpu(me)) { 109 if (__thread_has_fpu(me)) {
109 if (WARN_ON(restore_fpu_checking(me))) 110 if (WARN_ON(restore_fpu_checking(me)))
110 drop_init_fpu(me); 111 fpu_reset_state(me);
111 } else if (!use_eager_fpu()) { 112 } else if (!use_eager_fpu()) {
112 stts(); 113 stts();
113 } 114 }
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk)
120{ 121{
121 preempt_disable(); 122 preempt_disable();
122 if (__thread_has_fpu(tsk)) { 123 if (__thread_has_fpu(tsk)) {
123 __save_init_fpu(tsk); 124 if (use_eager_fpu()) {
124 __thread_fpu_end(tsk); 125 __save_fpu(tsk);
125 } else 126 } else {
126 tsk->thread.fpu_counter = 0; 127 __save_init_fpu(tsk);
128 __thread_fpu_end(tsk);
129 }
130 }
127 preempt_enable(); 131 preempt_enable();
128} 132}
129EXPORT_SYMBOL(unlazy_fpu); 133EXPORT_SYMBOL(unlazy_fpu);
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu)
221 return; 225 return;
222 } 226 }
223 227
228 memset(fpu->state, 0, xstate_size);
229
224 if (cpu_has_fxsr) { 230 if (cpu_has_fxsr) {
225 fx_finit(&fpu->state->fxsave); 231 fx_finit(&fpu->state->fxsave);
226 } else { 232 } else {
227 struct i387_fsave_struct *fp = &fpu->state->fsave; 233 struct i387_fsave_struct *fp = &fpu->state->fsave;
228 memset(fp, 0, xstate_size);
229 fp->cwd = 0xffff037fu; 234 fp->cwd = 0xffff037fu;
230 fp->swd = 0xffff0000u; 235 fp->swd = 0xffff0000u;
231 fp->twd = 0xffffffffu; 236 fp->twd = 0xffffffffu;
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk)
247 if (tsk_used_math(tsk)) { 252 if (tsk_used_math(tsk)) {
248 if (cpu_has_fpu && tsk == current) 253 if (cpu_has_fpu && tsk == current)
249 unlazy_fpu(tsk); 254 unlazy_fpu(tsk);
250 tsk->thread.fpu.last_cpu = ~0; 255 task_disable_lazy_fpu_restore(tsk);
251 return 0; 256 return 0;
252 } 257 }
253 258
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
336 unsigned int pos, unsigned int count, 341 unsigned int pos, unsigned int count,
337 void *kbuf, void __user *ubuf) 342 void *kbuf, void __user *ubuf)
338{ 343{
344 struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
339 int ret; 345 int ret;
340 346
341 if (!cpu_has_xsave) 347 if (!cpu_has_xsave)
@@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
350 * memory layout in the thread struct, so that we can copy the entire 356 * memory layout in the thread struct, so that we can copy the entire
351 * xstateregs to the user using one user_regset_copyout(). 357 * xstateregs to the user using one user_regset_copyout().
352 */ 358 */
353 memcpy(&target->thread.fpu.state->fxsave.sw_reserved, 359 memcpy(&xsave->i387.sw_reserved,
354 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 360 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
355
356 /* 361 /*
357 * Copy the xstate memory layout. 362 * Copy the xstate memory layout.
358 */ 363 */
359 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 364 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
360 &target->thread.fpu.state->xsave, 0, -1);
361 return ret; 365 return ret;
362} 366}
363 367
@@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
365 unsigned int pos, unsigned int count, 369 unsigned int pos, unsigned int count,
366 const void *kbuf, const void __user *ubuf) 370 const void *kbuf, const void __user *ubuf)
367{ 371{
372 struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
368 int ret; 373 int ret;
369 struct xsave_hdr_struct *xsave_hdr;
370 374
371 if (!cpu_has_xsave) 375 if (!cpu_has_xsave)
372 return -ENODEV; 376 return -ENODEV;
@@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
375 if (ret) 379 if (ret)
376 return ret; 380 return ret;
377 381
378 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 382 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
379 &target->thread.fpu.state->xsave, 0, -1);
380
381 /* 383 /*
382 * mxcsr reserved bits must be masked to zero for security reasons. 384 * mxcsr reserved bits must be masked to zero for security reasons.
383 */ 385 */
384 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; 386 xsave->i387.mxcsr &= mxcsr_feature_mask;
385 387 xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
386 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
387
388 xsave_hdr->xstate_bv &= pcntxt_mask;
389 /* 388 /*
390 * These bits must be zero. 389 * These bits must be zero.
391 */ 390 */
392 memset(xsave_hdr->reserved, 0, 48); 391 memset(&xsave->xsave_hdr.reserved, 0, 48);
393
394 return ret; 392 return ret;
395} 393}
396 394
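
The xstateregs_set() hunk above keeps the long-standing rule that user-supplied FPU state is sanitized before it is accepted: reserved MXCSR bits are masked for security, xstate_bv is limited to features the CPU reports, and the header's reserved area is cleared. A minimal standalone sketch of that pattern follows; the struct layout and mask parameters are simplified stand-ins, not the kernel's xsave_struct.

#include <stdint.h>
#include <string.h>

struct ex_xsave {
        uint32_t mxcsr;          /* stands in for xsave->i387.mxcsr */
        uint64_t xstate_bv;      /* stands in for xsave->xsave_hdr.xstate_bv */
        uint8_t  reserved[48];   /* header bytes that must stay zero */
};

static void ex_sanitize_xsave(struct ex_xsave *x,
                              uint32_t mxcsr_feature_mask,
                              uint64_t pcntxt_mask)
{
        x->mxcsr &= mxcsr_feature_mask;   /* reserved MXCSR bits -> 0 */
        x->xstate_bv &= pcntxt_mask;      /* only CPU-supported state bits */
        memset(x->reserved, 0, sizeof(x->reserved));
}
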
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
54 * because the ->io_bitmap_max value must match the bitmap 54 * because the ->io_bitmap_max value must match the bitmap
55 * contents: 55 * contents:
56 */ 56 */
57 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(cpu_tss, get_cpu());
58 58
59 if (turn_on) 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num); 60 bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe0093a..e5952c225532 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
295 295
296 this_cpu = smp_processor_id(); 296 this_cpu = smp_processor_id();
297 cpumask_copy(&online_new, cpu_online_mask); 297 cpumask_copy(&online_new, cpu_online_mask);
298 cpu_clear(this_cpu, online_new); 298 cpumask_clear_cpu(this_cpu, &online_new);
299 299
300 this_count = 0; 300 this_count = 0;
301 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 301 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
307 307
308 data = irq_desc_get_irq_data(desc); 308 data = irq_desc_get_irq_data(desc);
309 cpumask_copy(&affinity_new, data->affinity); 309 cpumask_copy(&affinity_new, data->affinity);
310 cpu_clear(this_cpu, affinity_new); 310 cpumask_clear_cpu(this_cpu, &affinity_new);
311 311
312 /* Do not count inactive or per-cpu irqs. */ 312 /* Do not count inactive or per-cpu irqs. */
313 if (!irq_has_action(irq) || irqd_is_per_cpu(data)) 313 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
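
Functionally the two hunks above only swap the long-deprecated cpu_clear() for cpumask_clear_cpu(); the check they feed is unchanged. As a rough standalone illustration of that check, with plain bitmaps standing in for struct cpumask:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t online   = 0x0f;     /* CPUs 0-3 online */
        uint64_t affinity = 0x01;     /* an IRQ bound to CPU 0 only */
        int dying = 0;                /* CPU being taken offline */

        uint64_t online_new   = online   & ~(1ULL << dying);
        uint64_t affinity_new = affinity & ~(1ULL << dying);

        if (!(affinity_new & online_new))
                printf("IRQ has no remaining online target CPU\n");
        return 0;
}
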
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
165 if (unlikely(!desc)) 165 if (unlikely(!desc))
166 return false; 166 return false;
167 167
168 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { 168 if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
169 if (unlikely(overflow)) 169 if (unlikely(overflow))
170 print_stack_overflow(); 170 print_stack_overflow();
171 desc->handle_irq(irq, desc); 171 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
44 u64 estack_top, estack_bottom; 44 u64 estack_top, estack_bottom;
45 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
46 46
47 if (user_mode_vm(regs)) 47 if (user_mode(regs))
48 return; 48 return;
49 49
50 if (regs->sp >= curbase + sizeof(struct thread_info) + 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
178#endif 178#endif
179 for_each_clear_bit_from(i, used_vectors, first_system_vector) { 179 for_each_clear_bit_from(i, used_vectors, first_system_vector) {
180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
181 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 181 set_intr_gate(i, irq_entries_start +
182 8 * (i - FIRST_EXTERNAL_VECTOR));
182 } 183 }
183#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
184 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) 185 for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f8d283..d6178d9791db 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
72 { "bx", 8, offsetof(struct pt_regs, bx) }, 72 { "bx", 8, offsetof(struct pt_regs, bx) },
73 { "cx", 8, offsetof(struct pt_regs, cx) }, 73 { "cx", 8, offsetof(struct pt_regs, cx) },
74 { "dx", 8, offsetof(struct pt_regs, dx) }, 74 { "dx", 8, offsetof(struct pt_regs, dx) },
75 { "si", 8, offsetof(struct pt_regs, dx) }, 75 { "si", 8, offsetof(struct pt_regs, si) },
76 { "di", 8, offsetof(struct pt_regs, di) }, 76 { "di", 8, offsetof(struct pt_regs, di) },
77 { "bp", 8, offsetof(struct pt_regs, bp) }, 77 { "bp", 8, offsetof(struct pt_regs, bp) },
78 { "sp", 8, offsetof(struct pt_regs, sp) }, 78 { "sp", 8, offsetof(struct pt_regs, sp) },
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
126#ifdef CONFIG_X86_32 126#ifdef CONFIG_X86_32
127 switch (regno) { 127 switch (regno) {
128 case GDB_SS: 128 case GDB_SS:
129 if (!user_mode_vm(regs)) 129 if (!user_mode(regs))
130 *(unsigned long *)mem = __KERNEL_DS; 130 *(unsigned long *)mem = __KERNEL_DS;
131 break; 131 break;
132 case GDB_SP: 132 case GDB_SP:
133 if (!user_mode_vm(regs)) 133 if (!user_mode(regs))
134 *(unsigned long *)mem = kernel_stack_pointer(regs); 134 *(unsigned long *)mem = kernel_stack_pointer(regs);
135 break; 135 break;
136 case GDB_GS: 136 case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 03189d86357d..1deffe6cc873 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -605,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
605 struct kprobe *p; 605 struct kprobe *p;
606 struct kprobe_ctlblk *kcb; 606 struct kprobe_ctlblk *kcb;
607 607
608 if (user_mode_vm(regs)) 608 if (user_mode(regs))
609 return 0; 609 return 0;
610 610
611 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 611 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1010,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
1010 struct die_args *args = data; 1010 struct die_args *args = data;
1011 int ret = NOTIFY_DONE; 1011 int ret = NOTIFY_DONE;
1012 1012
1013 if (args->regs && user_mode_vm(args->regs)) 1013 if (args->regs && user_mode(args->regs))
1014 return ret; 1014 return ret;
1015 1015
1016 if (val == DIE_GPF) { 1016 if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
513 * can get false positives too easily, for example if the host is 513 * can get false positives too easily, for example if the host is
514 * overcommitted. 514 * overcommitted.
515 */ 515 */
516 watchdog_enable_hardlockup_detector(false); 516 hardlockup_detector_disable();
517} 517}
518 518
519static noinline uint32_t __kvm_cpuid_base(void) 519static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> 34#include <asm/page.h>
35#include <asm/pgtable.h> 35#include <asm/pgtable.h>
36#include <asm/setup.h>
36 37
37#if 0 38#if 0
38#define DEBUGP(fmt, ...) \ 39#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
47 48
48#ifdef CONFIG_RANDOMIZE_BASE 49#ifdef CONFIG_RANDOMIZE_BASE
49static unsigned long module_load_offset; 50static unsigned long module_load_offset;
50static int randomize_modules = 1;
51 51
52/* Mutex protects the module_load_offset. */ 52/* Mutex protects the module_load_offset. */
53static DEFINE_MUTEX(module_kaslr_mutex); 53static DEFINE_MUTEX(module_kaslr_mutex);
54 54
55static int __init parse_nokaslr(char *p)
56{
57 randomize_modules = 0;
58 return 0;
59}
60early_param("nokaslr", parse_nokaslr);
61
62static unsigned long int get_module_load_offset(void) 55static unsigned long int get_module_load_offset(void)
63{ 56{
64 if (randomize_modules) { 57 if (kaslr_enabled()) {
65 mutex_lock(&module_kaslr_mutex); 58 mutex_lock(&module_kaslr_mutex);
66 /* 59 /*
67 * Calculate the module_load_offset the first time this 60 * Calculate the module_load_offset the first time this
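
With the private nokaslr parsing gone, module randomization simply follows kaslr_enabled(). What remains is a compute-once-under-a-lock pattern; a rough standalone sketch of it, with pthread and rand() standing in for the kernel mutex and RNG, and the page-count range chosen for illustration:

#include <pthread.h>
#include <stdlib.h>

static unsigned long ex_load_offset;
static pthread_mutex_t ex_offset_lock = PTHREAD_MUTEX_INITIALIZER;
static int ex_kaslr_enabled = 1;          /* stands in for kaslr_enabled() */

static unsigned long ex_get_module_load_offset(void)
{
        unsigned long off;

        if (!ex_kaslr_enabled)
                return 0;

        pthread_mutex_lock(&ex_offset_lock);
        /* Pick the random offset once, on first use. */
        if (ex_load_offset == 0)
                ex_load_offset = ((unsigned long)(rand() % 1024) + 1) * 4096;
        off = ex_load_offset;
        pthread_mutex_unlock(&ex_offset_lock);

        return off;
}
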
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
443 .ptep_modify_prot_start = __ptep_modify_prot_start, 443 .ptep_modify_prot_start = __ptep_modify_prot_start,
444 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 444 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
445 445
446#if PAGETABLE_LEVELS >= 3 446#if CONFIG_PGTABLE_LEVELS >= 3
447#ifdef CONFIG_X86_PAE 447#ifdef CONFIG_X86_PAE
448 .set_pte_atomic = native_set_pte_atomic, 448 .set_pte_atomic = native_set_pte_atomic,
449 .pte_clear = native_pte_clear, 449 .pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
454 .pmd_val = PTE_IDENT, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT, 455 .make_pmd = PTE_IDENT,
456 456
457#if PAGETABLE_LEVELS == 4 457#if CONFIG_PGTABLE_LEVELS == 4
458 .pud_val = PTE_IDENT, 458 .pud_val = PTE_IDENT,
459 .make_pud = PTE_IDENT, 459 .make_pud = PTE_IDENT,
460 460
461 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
462#endif 462#endif
463#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
464 464
465 .pte_val = PTE_IDENT, 465 .pte_val = PTE_IDENT,
466 .pgd_val = PTE_IDENT, 466 .pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
131 } 131 }
132 132
133 /* 133 /*
134 * RIP, flags, and the argument registers are usually saved. 134 * These registers are always saved on 64-bit syscall entry.
135 * orig_ax is probably okay, too. 135 * On 32-bit entry points, they are saved too except r8..r11.
136 */ 136 */
137 regs_user_copy->ip = user_regs->ip; 137 regs_user_copy->ip = user_regs->ip;
138 regs_user_copy->ax = user_regs->ax;
138 regs_user_copy->cx = user_regs->cx; 139 regs_user_copy->cx = user_regs->cx;
139 regs_user_copy->dx = user_regs->dx; 140 regs_user_copy->dx = user_regs->dx;
140 regs_user_copy->si = user_regs->si; 141 regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
145 regs_user_copy->r11 = user_regs->r11; 146 regs_user_copy->r11 = user_regs->r11;
146 regs_user_copy->orig_ax = user_regs->orig_ax; 147 regs_user_copy->orig_ax = user_regs->orig_ax;
147 regs_user_copy->flags = user_regs->flags; 148 regs_user_copy->flags = user_regs->flags;
149 regs_user_copy->sp = user_regs->sp;
150 regs_user_copy->cs = user_regs->cs;
151 regs_user_copy->ss = user_regs->ss;
148 152
149 /* 153 /*
150 * Don't even try to report the "rest" regs. 154 * Most system calls don't save these registers, so don't report them.
151 */ 155 */
152 regs_user_copy->bx = -1; 156 regs_user_copy->bx = -1;
153 regs_user_copy->bp = -1; 157 regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
158 162
159 /* 163 /*
160 * For this to be at all useful, we need a reasonable guess for 164 * For this to be at all useful, we need a reasonable guess for
161 * sp and the ABI. Be careful: we're in NMI context, and we're 165 * the ABI. Be careful: we're in NMI context, and we're
162 * considering current to be the current task, so we should 166 * considering current to be the current task, so we should
163 * be careful not to look at any other percpu variables that might 167 * be careful not to look at any other percpu variables that might
164 * change during context switches. 168 * change during context switches.
165 */ 169 */
166 if (IS_ENABLED(CONFIG_IA32_EMULATION) && 170 regs_user->abi = user_64bit_mode(user_regs) ?
167 task_thread_info(current)->status & TS_COMPAT) { 171 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
168 /* Easy case: we're in a compat syscall. */
169 regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
170 regs_user_copy->sp = user_regs->sp;
171 regs_user_copy->cs = user_regs->cs;
172 regs_user_copy->ss = user_regs->ss;
173 } else if (user_regs->orig_ax != -1) {
174 /*
175 * We're probably in a 64-bit syscall.
176 * Warning: this code is severely racy. At least it's better
177 * than just blindly copying user_regs.
178 */
179 regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
180 regs_user_copy->sp = this_cpu_read(old_rsp);
181 regs_user_copy->cs = __USER_CS;
182 regs_user_copy->ss = __USER_DS;
183 regs_user_copy->cx = -1; /* usually contains garbage */
184 } else {
185 /* We're probably in an interrupt or exception. */
186 regs_user->abi = user_64bit_mode(user_regs) ?
187 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
188 regs_user_copy->sp = user_regs->sp;
189 regs_user_copy->cs = user_regs->cs;
190 regs_user_copy->ss = user_regs->ss;
191 }
192 172
193 regs_user->regs = regs_user_copy; 173 regs_user->regs = regs_user_copy;
194} 174}
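
The rewritten tail of perf_get_regs_user() drops the racy special cases and derives the sample ABI purely from the saved register frame via user_64bit_mode(), which on a native 64-bit kernel comes down to checking which user code segment was in use. A standalone sketch of that idea; the selector value is illustrative:

#include <stdio.h>

#define EX_USER64_CS 0x33UL        /* stand-in for the 64-bit __USER_CS */

static int ex_user_64bit_mode(unsigned long saved_cs)
{
        return saved_cs == EX_USER64_CS;
}

int main(void)
{
        unsigned long cs = EX_USER64_CS;

        printf("sample ABI: %s\n",
               ex_user_64bit_mode(cs) ? "ABI_64" : "ABI_32");
        return 0;
}
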
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..8213da62b1b7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pm.h> 11#include <linux/pm.h>
12#include <linux/clockchips.h> 12#include <linux/tick.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/user-return-notifier.h> 14#include <linux/user-return-notifier.h>
15#include <linux/dmi.h> 15#include <linux/dmi.h>
@@ -24,6 +24,7 @@
24#include <asm/syscalls.h> 24#include <asm/syscalls.h>
25#include <asm/idle.h> 25#include <asm/idle.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27#include <asm/mwait.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/fpu-internal.h> 29#include <asm/fpu-internal.h>
29#include <asm/debugreg.h> 30#include <asm/debugreg.h>
@@ -37,7 +38,26 @@
37 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
38 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
39 */ 40 */
40__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; 41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
42 .x86_tss = {
43 .sp0 = TOP_OF_INIT_STACK,
44#ifdef CONFIG_X86_32
45 .ss0 = __KERNEL_DS,
46 .ss1 = __KERNEL_CS,
47 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
48#endif
49 },
50#ifdef CONFIG_X86_32
51 /*
52 * Note that the .io_bitmap member must be extra-big. This is because
53 * the CPU will access an additional byte beyond the end of the IO
54 * permission bitmap. The extra byte must be all 1 bits, and must
55 * be within the limit.
56 */
57 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
58#endif
59};
60EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
41 61
42#ifdef CONFIG_X86_64 62#ifdef CONFIG_X86_64
43static DEFINE_PER_CPU(unsigned char, is_idle); 63static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
69 89
70 dst->thread.fpu_counter = 0; 90 dst->thread.fpu_counter = 0;
71 dst->thread.fpu.has_fpu = 0; 91 dst->thread.fpu.has_fpu = 0;
72 dst->thread.fpu.last_cpu = ~0;
73 dst->thread.fpu.state = NULL; 92 dst->thread.fpu.state = NULL;
93 task_disable_lazy_fpu_restore(dst);
74 if (tsk_used_math(src)) { 94 if (tsk_used_math(src)) {
75 int err = fpu_alloc(&dst->thread.fpu); 95 int err = fpu_alloc(&dst->thread.fpu);
76 if (err) 96 if (err)
@@ -109,7 +129,7 @@ void exit_thread(void)
109 unsigned long *bp = t->io_bitmap_ptr; 129 unsigned long *bp = t->io_bitmap_ptr;
110 130
111 if (bp) { 131 if (bp) {
112 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 132 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
113 133
114 t->io_bitmap_ptr = NULL; 134 t->io_bitmap_ptr = NULL;
115 clear_thread_flag(TIF_IO_BITMAP); 135 clear_thread_flag(TIF_IO_BITMAP);
@@ -131,13 +151,18 @@ void flush_thread(void)
131 151
132 flush_ptrace_hw_breakpoint(tsk); 152 flush_ptrace_hw_breakpoint(tsk);
133 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 153 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
134 drop_init_fpu(tsk); 154
135 /* 155 if (!use_eager_fpu()) {
136 * Free the FPU state for non xsave platforms. They get reallocated 156 /* FPU state will be reallocated lazily at the first use. */
137 * lazily at the first use. 157 drop_fpu(tsk);
138 */
139 if (!use_eager_fpu())
140 free_thread_xstate(tsk); 158 free_thread_xstate(tsk);
159 } else if (!used_math()) {
160 /* kthread execs. TODO: cleanup this horror. */
161 if (WARN_ON(init_fpu(tsk)))
162 force_sig(SIGKILL, tsk);
163 user_fpu_begin();
164 restore_init_xstate();
165 }
141} 166}
142 167
143static void hard_disable_TSC(void) 168static void hard_disable_TSC(void)
@@ -377,14 +402,11 @@ static void amd_e400_idle(void)
377 402
378 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { 403 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
379 cpumask_set_cpu(cpu, amd_e400_c1e_mask); 404 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
380 /* 405 /* Force broadcast so ACPI can not interfere. */
381 * Force broadcast so ACPI can not interfere. 406 tick_broadcast_force();
382 */
383 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
384 &cpu);
385 pr_info("Switch to broadcast mode on CPU%d\n", cpu); 407 pr_info("Switch to broadcast mode on CPU%d\n", cpu);
386 } 408 }
387 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 409 tick_broadcast_enter();
388 410
389 default_idle(); 411 default_idle();
390 412
@@ -393,12 +415,59 @@ static void amd_e400_idle(void)
393 * called with interrupts disabled. 415 * called with interrupts disabled.
394 */ 416 */
395 local_irq_disable(); 417 local_irq_disable();
396 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 418 tick_broadcast_exit();
397 local_irq_enable(); 419 local_irq_enable();
398 } else 420 } else
399 default_idle(); 421 default_idle();
400} 422}
401 423
424/*
425 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
426 * We can't rely on cpuidle installing MWAIT, because it will not load
427 * on systems that support only C1 -- so the boot default must be MWAIT.
428 *
429 * Some AMD machines are the opposite, they depend on using HALT.
430 *
431 * So for default C1, which is used during boot until cpuidle loads,
432 * use MWAIT-C1 on Intel HW that has it, else use HALT.
433 */
434static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
435{
436 if (c->x86_vendor != X86_VENDOR_INTEL)
437 return 0;
438
439 if (!cpu_has(c, X86_FEATURE_MWAIT))
440 return 0;
441
442 return 1;
443}
444
445/*
446 * MONITOR/MWAIT with no hints, used for default C1 state.
447 * This invokes MWAIT with interrupts enabled and no flags,
448 * which is backwards compatible with the original MWAIT implementation.
449 */
450
451static void mwait_idle(void)
452{
453 if (!current_set_polling_and_test()) {
454 if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
455 smp_mb(); /* quirk */
456 clflush((void *)&current_thread_info()->flags);
457 smp_mb(); /* quirk */
458 }
459
460 __monitor((void *)&current_thread_info()->flags, 0, 0);
461 if (!need_resched())
462 __sti_mwait(0, 0);
463 else
464 local_irq_enable();
465 } else {
466 local_irq_enable();
467 }
468 __current_clr_polling();
469}
470
402void select_idle_routine(const struct cpuinfo_x86 *c) 471void select_idle_routine(const struct cpuinfo_x86 *c)
403{ 472{
404#ifdef CONFIG_SMP 473#ifdef CONFIG_SMP
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
412 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 481 /* E400: APIC timer interrupt does not wake up CPU from C1e */
413 pr_info("using AMD E400 aware idle routine\n"); 482 pr_info("using AMD E400 aware idle routine\n");
414 x86_idle = amd_e400_idle; 483 x86_idle = amd_e400_idle;
484 } else if (prefer_mwait_c1_over_halt(c)) {
485 pr_info("using mwait in idle threads\n");
486 x86_idle = mwait_idle;
415 } else 487 } else
416 x86_idle = default_idle; 488 x86_idle = default_idle;
417} 489}
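
One detail worth noting from the new cpu_tss initializer above is the I/O bitmap sizing rule its comment spells out: the CPU may read one byte past the end of the bitmap, so the array carries one extra element that stays all-ones. A standalone sketch of that rule; the sizes and struct name are illustrative, not the kernel's tss_struct:

#include <stdint.h>

#define EX_IO_BITMAP_BITS   65536
#define EX_IO_BITMAP_LONGS  (EX_IO_BITMAP_BITS / 64)

struct ex_tss {
        /* hardware-defined fields elided */
        uint64_t io_bitmap[EX_IO_BITMAP_LONGS + 1];   /* +1 for the overread */
};

/* GNU C range initializer, as in the hunk above: every long is all-ones, so
 * any port not explicitly enabled (and the overread byte) reads as "deny". */
static struct ex_tss ex_tss = {
        .io_bitmap = { [0 ... EX_IO_BITMAP_LONGS] = ~0ULL },
};
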
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
73 unsigned long sp; 73 unsigned long sp;
74 unsigned short ss, gs; 74 unsigned short ss, gs;
75 75
76 if (user_mode_vm(regs)) { 76 if (user_mode(regs)) {
77 sp = regs->sp; 77 sp = regs->sp;
78 ss = regs->ss & 0xffff; 78 ss = regs->ss & 0xffff;
79 gs = get_user_gs(regs); 79 gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
206 regs->ip = new_ip; 206 regs->ip = new_ip;
207 regs->sp = new_sp; 207 regs->sp = new_sp;
208 regs->flags = X86_EFLAGS_IF; 208 regs->flags = X86_EFLAGS_IF;
209 /* 209 force_iret();
210 * force it to the iret return path by making it look as if there was
211 * some work pending.
212 */
213 set_thread_flag(TIF_NOTIFY_RESUME);
214} 210}
215EXPORT_SYMBOL_GPL(start_thread); 211EXPORT_SYMBOL_GPL(start_thread);
216 212
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
248 struct thread_struct *prev = &prev_p->thread, 244 struct thread_struct *prev = &prev_p->thread,
249 *next = &next_p->thread; 245 *next = &next_p->thread;
250 int cpu = smp_processor_id(); 246 int cpu = smp_processor_id();
251 struct tss_struct *tss = &per_cpu(init_tss, cpu); 247 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
252 fpu_switch_t fpu; 248 fpu_switch_t fpu;
253 249
254 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 250 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
256 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 252 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
257 253
258 /* 254 /*
259 * Reload esp0.
260 */
261 load_sp0(tss, next);
262
263 /*
264 * Save away %gs. No need to save %fs, as it was saved on the 255 * Save away %gs. No need to save %fs, as it was saved on the
265 * stack on entry. No need to save %es and %ds, as those are 256 * stack on entry. No need to save %es and %ds, as those are
266 * always kernel segments while inside the kernel. Doing this 257 * always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
310 */ 301 */
311 arch_end_context_switch(next_p); 302 arch_end_context_switch(next_p);
312 303
304 /*
305 * Reload esp0, kernel_stack, and current_top_of_stack. This changes
306 * current_thread_info().
307 */
308 load_sp0(tss, next);
313 this_cpu_write(kernel_stack, 309 this_cpu_write(kernel_stack,
314 (unsigned long)task_stack_page(next_p) + 310 (unsigned long)task_stack_page(next_p) +
315 THREAD_SIZE - KERNEL_STACK_OFFSET); 311 THREAD_SIZE);
312 this_cpu_write(cpu_current_top_of_stack,
313 (unsigned long)task_stack_page(next_p) +
314 THREAD_SIZE);
316 315
317 /* 316 /*
318 * Restore %gs if needed (which is common) 317 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52 52
53asmlinkage extern void ret_from_fork(void); 53asmlinkage extern void ret_from_fork(void);
54 54
55__visible DEFINE_PER_CPU(unsigned long, old_rsp); 55__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
56 56
57/* Prints also some state that isn't saved in the pt_regs */ 57/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 58void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; 161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
162 childregs = task_pt_regs(p); 162 childregs = task_pt_regs(p);
163 p->thread.sp = (unsigned long) childregs; 163 p->thread.sp = (unsigned long) childregs;
164 p->thread.usersp = me->thread.usersp;
165 set_tsk_thread_flag(p, TIF_FORK); 164 set_tsk_thread_flag(p, TIF_FORK);
166 p->thread.io_bitmap_ptr = NULL; 165 p->thread.io_bitmap_ptr = NULL;
167 166
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
207 */ 206 */
208 if (clone_flags & CLONE_SETTLS) { 207 if (clone_flags & CLONE_SETTLS) {
209#ifdef CONFIG_IA32_EMULATION 208#ifdef CONFIG_IA32_EMULATION
210 if (test_thread_flag(TIF_IA32)) 209 if (is_ia32_task())
211 err = do_set_thread_area(p, -1, 210 err = do_set_thread_area(p, -1,
212 (struct user_desc __user *)childregs->si, 0); 211 (struct user_desc __user *)childregs->si, 0);
213 else 212 else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
235 loadsegment(es, _ds); 234 loadsegment(es, _ds);
236 loadsegment(ds, _ds); 235 loadsegment(ds, _ds);
237 load_gs_index(0); 236 load_gs_index(0);
238 current->thread.usersp = new_sp;
239 regs->ip = new_ip; 237 regs->ip = new_ip;
240 regs->sp = new_sp; 238 regs->sp = new_sp;
241 this_cpu_write(old_rsp, new_sp);
242 regs->cs = _cs; 239 regs->cs = _cs;
243 regs->ss = _ss; 240 regs->ss = _ss;
244 regs->flags = X86_EFLAGS_IF; 241 regs->flags = X86_EFLAGS_IF;
242 force_iret();
245} 243}
246 244
247void 245void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
277 struct thread_struct *prev = &prev_p->thread; 275 struct thread_struct *prev = &prev_p->thread;
278 struct thread_struct *next = &next_p->thread; 276 struct thread_struct *next = &next_p->thread;
279 int cpu = smp_processor_id(); 277 int cpu = smp_processor_id();
280 struct tss_struct *tss = &per_cpu(init_tss, cpu); 278 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
281 unsigned fsindex, gsindex; 279 unsigned fsindex, gsindex;
282 fpu_switch_t fpu; 280 fpu_switch_t fpu;
283 281
284 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 282 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
285 283
286 /* Reload esp0 and ss1. */
287 load_sp0(tss, next);
288
289 /* We must save %fs and %gs before load_TLS() because 284 /* We must save %fs and %gs before load_TLS() because
290 * %fs and %gs may be cleared by load_TLS(). 285 * %fs and %gs may be cleared by load_TLS().
291 * 286 *
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
401 /* 396 /*
402 * Switch the PDA and FPU contexts. 397 * Switch the PDA and FPU contexts.
403 */ 398 */
404 prev->usersp = this_cpu_read(old_rsp);
405 this_cpu_write(old_rsp, next->usersp);
406 this_cpu_write(current_task, next_p); 399 this_cpu_write(current_task, next_p);
407 400
408 /* 401 /*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
413 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); 406 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
414 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); 407 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
415 408
409 /* Reload esp0 and ss1. This changes current_thread_info(). */
410 load_sp0(tss, next);
411
416 this_cpu_write(kernel_stack, 412 this_cpu_write(kernel_stack,
417 (unsigned long)task_stack_page(next_p) + 413 (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
418 THREAD_SIZE - KERNEL_STACK_OFFSET);
419 414
420 /* 415 /*
421 * Now maybe reload the debug registers and handle I/O bitmaps 416 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
602 597
603unsigned long KSTK_ESP(struct task_struct *task) 598unsigned long KSTK_ESP(struct task_struct *task)
604{ 599{
605 return (test_tsk_thread_flag(task, TIF_IA32)) ? 600 return task_pt_regs(task)->sp;
606 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
607} 601}
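
The KSTK_ESP() change above works because, after this series, a task's user-mode registers can always be found in the pt_regs frame at the top of its kernel stack, so the TIF_IA32/usersp special case disappears. A standalone sketch of that layout; the stack size and field set are illustrative:

#include <stdint.h>
#include <stdio.h>

#define EX_THREAD_SIZE (16 * 1024)

struct ex_pt_regs {
        uint64_t sp;               /* saved user stack pointer */
        /* ... the other saved registers ... */
};

static struct ex_pt_regs *ex_task_pt_regs(void *stack_page)
{
        return (struct ex_pt_regs *)((char *)stack_page + EX_THREAD_SIZE) - 1;
}

int main(void)
{
        static uint64_t stack[EX_THREAD_SIZE / sizeof(uint64_t)];
        struct ex_pt_regs *regs = ex_task_pt_regs(stack);

        regs->sp = 0x7ffffffde000ULL;
        printf("KSTK_ESP-style lookup: %#llx\n",
               (unsigned long long)regs->sp);
        return 0;
}
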
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
364 case offsetof(struct user_regs_struct,cs): 364 case offsetof(struct user_regs_struct,cs):
365 if (unlikely(value == 0)) 365 if (unlikely(value == 0))
366 return -EIO; 366 return -EIO;
367#ifdef CONFIG_IA32_EMULATION 367 task_pt_regs(task)->cs = value;
368 if (test_tsk_thread_flag(task, TIF_IA32))
369 task_pt_regs(task)->cs = value;
370#endif
371 break; 368 break;
372 case offsetof(struct user_regs_struct,ss): 369 case offsetof(struct user_regs_struct,ss):
373 if (unlikely(value == 0)) 370 if (unlikely(value == 0))
374 return -EIO; 371 return -EIO;
375#ifdef CONFIG_IA32_EMULATION 372 task_pt_regs(task)->ss = value;
376 if (test_tsk_thread_flag(task, TIF_IA32))
377 task_pt_regs(task)->ss = value;
378#endif
379 break; 373 break;
380 } 374 }
381 375
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
1421 memset(info, 0, sizeof(*info)); 1415 memset(info, 0, sizeof(*info));
1422 info->si_signo = SIGTRAP; 1416 info->si_signo = SIGTRAP;
1423 info->si_code = si_code; 1417 info->si_code = si_code;
1424 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; 1418 info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
1425} 1419}
1426 1420
1427void user_single_step_siginfo(struct task_struct *tsk, 1421void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..e5ecd20e72dd 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
142} 142}
143 143
144static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
145
146static struct pvclock_vsyscall_time_info *
147pvclock_get_vsyscall_user_time_info(int cpu)
148{
149 if (!pvclock_vdso_info) {
150 BUG();
151 return NULL;
152 }
153
154 return &pvclock_vdso_info[cpu];
155}
156
157struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
158{
159 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
160}
161
144#ifdef CONFIG_X86_64 162#ifdef CONFIG_X86_64
163static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
164 void *v)
165{
166 struct task_migration_notifier *mn = v;
167 struct pvclock_vsyscall_time_info *pvti;
168
169 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
170
171 /* this is NULL when pvclock vsyscall is not initialized */
172 if (unlikely(pvti == NULL))
173 return NOTIFY_DONE;
174
175 pvti->migrate_count++;
176
177 return NOTIFY_DONE;
178}
179
180static struct notifier_block pvclock_migrate = {
181 .notifier_call = pvclock_task_migrate,
182};
183
145/* 184/*
146 * Initialize the generic pvclock vsyscall state. This will allocate 185 * Initialize the generic pvclock vsyscall state. This will allocate
147 * a/some page(s) for the per-vcpu pvclock information, set up a 186 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
155 194
156 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 195 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
157 196
197 pvclock_vdso_info = i;
198
158 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 199 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
159 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 200 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
160 __pa(i) + (idx*PAGE_SIZE), 201 __pa(i) + (idx*PAGE_SIZE),
161 PAGE_KERNEL_VVAR); 202 PAGE_KERNEL_VVAR);
162 } 203 }
163 204
205
206 register_task_migration_notifier(&pvclock_migrate);
207
164 return 0; 208 return 0;
165} 209}
166#endif 210#endif
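
The new migration notifier only increments a per-vCPU migrate_count; the point is to let a lock-free reader (the pvclock vDSO path) notice that the task changed CPUs mid-read and retry. A rough standalone sketch of that reader pattern; the field names and types are illustrative, and the real code also rechecks the pvclock version counter:

#include <stdint.h>

struct ex_pvti {
        volatile uint32_t migrate_count;
        volatile uint64_t time_ns;      /* real pvti holds tsc/scale/shift */
};

static uint64_t ex_read_clock(const struct ex_pvti *pvti)
{
        uint32_t m1, m2;
        uint64_t t;

        do {
                m1 = pvti->migrate_count;
                t  = pvti->time_ns;      /* pvclock arithmetic elided */
                m2 = pvti->migrate_count;
        } while (m1 != m2);              /* moved CPUs mid-read: retry */

        return t;
}
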
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index bae6c609888e..86db4bcd7ce5 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
183 }, 183 },
184 }, 184 },
185 185
186 /* ASRock */
187 { /* Handle problems with rebooting on ASRock Q1900DC-ITX */
188 .callback = set_pci_reboot,
189 .ident = "ASRock Q1900DC-ITX",
190 .matches = {
191 DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
192 DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
193 },
194 },
195
186 /* ASUS */ 196 /* ASUS */
187 { /* Handle problems with rebooting on ASUS P4S800 */ 197 { /* Handle problems with rebooting on ASUS P4S800 */
188 .callback = set_bios_reboot, 198 .callback = set_bios_reboot,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
226 movl (%ebx), %ecx 226 movl (%ebx), %ecx
227 addl $4, %ebx 227 addl $4, %ebx
2281: 2281:
229 testl $0x1, %ecx /* is it a destination page */ 229 testb $0x1, %cl /* is it a destination page */
230 jz 2f 230 jz 2f
231 movl %ecx, %edi 231 movl %ecx, %edi
232 andl $0xfffff000, %edi 232 andl $0xfffff000, %edi
233 jmp 0b 233 jmp 0b
2342: 2342:
235 testl $0x2, %ecx /* is it an indirection page */ 235 testb $0x2, %cl /* is it an indirection page */
236 jz 2f 236 jz 2f
237 movl %ecx, %ebx 237 movl %ecx, %ebx
238 andl $0xfffff000, %ebx 238 andl $0xfffff000, %ebx
239 jmp 0b 239 jmp 0b
2402: 2402:
241 testl $0x4, %ecx /* is it the done indicator */ 241 testb $0x4, %cl /* is it the done indicator */
242 jz 2f 242 jz 2f
243 jmp 3f 243 jmp 3f
2442: 2442:
245 testl $0x8, %ecx /* is it the source indicator */ 245 testb $0x8, %cl /* is it the source indicator */
246 jz 0b /* Ignore it otherwise */ 246 jz 0b /* Ignore it otherwise */
247 movl %ecx, %esi /* For every source page do a copy */ 247 movl %ecx, %esi /* For every source page do a copy */
248 andl $0xfffff000, %esi 248 andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
123 * Set cr4 to a known state: 123 * Set cr4 to a known state:
124 * - physical address extension enabled 124 * - physical address extension enabled
125 */ 125 */
126 movq $X86_CR4_PAE, %rax 126 movl $X86_CR4_PAE, %eax
127 movq %rax, %cr4 127 movq %rax, %cr4
128 128
129 jmp 1f 129 jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
221 movq (%rbx), %rcx 221 movq (%rbx), %rcx
222 addq $8, %rbx 222 addq $8, %rbx
2231: 2231:
224 testq $0x1, %rcx /* is it a destination page? */ 224 testb $0x1, %cl /* is it a destination page? */
225 jz 2f 225 jz 2f
226 movq %rcx, %rdi 226 movq %rcx, %rdi
227 andq $0xfffffffffffff000, %rdi 227 andq $0xfffffffffffff000, %rdi
228 jmp 0b 228 jmp 0b
2292: 2292:
230 testq $0x2, %rcx /* is it an indirection page? */ 230 testb $0x2, %cl /* is it an indirection page? */
231 jz 2f 231 jz 2f
232 movq %rcx, %rbx 232 movq %rcx, %rbx
233 andq $0xfffffffffffff000, %rbx 233 andq $0xfffffffffffff000, %rbx
234 jmp 0b 234 jmp 0b
2352: 2352:
236 testq $0x4, %rcx /* is it the done indicator? */ 236 testb $0x4, %cl /* is it the done indicator? */
237 jz 2f 237 jz 2f
238 jmp 3f 238 jmp 3f
2392: 2392:
240 testq $0x8, %rcx /* is it the source indicator? */ 240 testb $0x8, %cl /* is it the source indicator? */
241 jz 0b /* Ignore it otherwise */ 241 jz 0b /* Ignore it otherwise */
242 movq %rcx, %rsi /* For ever source page do a copy */ 242 movq %rcx, %rsi /* For ever source page do a copy */
243 andq $0xfffffffffffff000, %rsi 243 andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
246 movq %rsi, %rax 246 movq %rsi, %rax
247 247
248 movq %r10, %rdi 248 movq %r10, %rdi
249 movq $512, %rcx 249 movl $512, %ecx
250 rep ; movsq 250 rep ; movsq
251 251
252 movq %rax, %rdi 252 movq %rax, %rdi
253 movq %rdx, %rsi 253 movq %rdx, %rsi
254 movq $512, %rcx 254 movl $512, %ecx
255 rep ; movsq 255 rep ; movsq
256 256
257 movq %rdx, %rdi 257 movq %rdx, %rdi
258 movq %r10, %rsi 258 movq %r10, %rsi
259 movq $512, %rcx 259 movl $512, %ecx
260 rep ; movsq 260 rep ; movsq
261 261
262 lea PAGE_SIZE(%rax), %rsi 262 lea PAGE_SIZE(%rax), %rsi
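
The testq-to-testb changes above are pure code-size tweaks: the kimage entry flags all live in the low byte, because each entry is a page-aligned address with its type encoded in the low bits. A standalone C sketch of that encoding; the flag values mirror the ones tested in the loops above, and the helper names are illustrative:

#include <stdio.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8

static unsigned long ind_addr(unsigned long entry)
{
        return entry & ~0xfffUL;         /* strip the low-bit flags */
}

int main(void)
{
        unsigned long entry = 0x7f000UL | IND_SOURCE;

        if (entry & IND_SOURCE)          /* only the low byte is examined */
                printf("source page at %#lx\n", ind_addr(entry));
        return 0;
}
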
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..d74ac33290ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
354 mapaddr = ramdisk_image & PAGE_MASK; 354 mapaddr = ramdisk_image & PAGE_MASK;
355 p = early_memremap(mapaddr, clen+slop); 355 p = early_memremap(mapaddr, clen+slop);
356 memcpy(q, p+slop, clen); 356 memcpy(q, p+slop, clen);
357 early_iounmap(p, clen+slop); 357 early_memunmap(p, clen+slop);
358 q += clen; 358 q += clen;
359 ramdisk_image += clen; 359 ramdisk_image += clen;
360 ramdisk_size -= clen; 360 ramdisk_size -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
438 data_len = data->len + sizeof(struct setup_data); 438 data_len = data->len + sizeof(struct setup_data);
439 data_type = data->type; 439 data_type = data->type;
440 pa_next = data->next; 440 pa_next = data->next;
441 early_iounmap(data, sizeof(*data)); 441 early_memunmap(data, sizeof(*data));
442 442
443 switch (data_type) { 443 switch (data_type) {
444 case SETUP_E820_EXT: 444 case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
470 E820_RAM, E820_RESERVED_KERN); 470 E820_RAM, E820_RESERVED_KERN);
471 found = 1; 471 found = 1;
472 pa_data = data->next; 472 pa_data = data->next;
473 early_iounmap(data, sizeof(*data)); 473 early_memunmap(data, sizeof(*data));
474 } 474 }
475 if (!found) 475 if (!found)
476 return; 476 return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
491 data = early_memremap(pa_data, sizeof(*data)); 491 data = early_memremap(pa_data, sizeof(*data));
492 memblock_reserve(pa_data, sizeof(*data) + data->len); 492 memblock_reserve(pa_data, sizeof(*data) + data->len);
493 pa_data = data->next; 493 pa_data = data->next;
494 early_iounmap(data, sizeof(*data)); 494 early_memunmap(data, sizeof(*data));
495 } 495 }
496} 496}
497 497
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
832static int 832static int
833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) 833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
834{ 834{
835 pr_emerg("Kernel Offset: 0x%lx from 0x%lx " 835 if (kaslr_enabled()) {
836 "(relocation range: 0x%lx-0x%lx)\n", 836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
837 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, 837 (unsigned long)&_text - __START_KERNEL,
838 __START_KERNEL_map, MODULES_VADDR-1); 838 __START_KERNEL,
839 __START_KERNEL_map,
840 MODULES_VADDR-1);
841 } else {
842 pr_emerg("Kernel Offset: disabled\n");
843 }
839 844
840 return 0; 845 return 0;
841} 846}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..f9804080ccb3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
61 regs->seg = GET_SEG(seg) | 3; \ 61 regs->seg = GET_SEG(seg) | 3; \
62} while (0) 62} while (0)
63 63
64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
65 unsigned long *pax)
66{ 65{
67 void __user *buf; 66 void __user *buf;
68 unsigned int tmpflags; 67 unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
81#endif /* CONFIG_X86_32 */ 80#endif /* CONFIG_X86_32 */
82 81
83 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 82 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
84 COPY(dx); COPY(cx); COPY(ip); 83 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
85 84
86#ifdef CONFIG_X86_64 85#ifdef CONFIG_X86_64
87 COPY(r8); 86 COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
94 COPY(r15); 93 COPY(r15);
95#endif /* CONFIG_X86_64 */ 94#endif /* CONFIG_X86_64 */
96 95
97#ifdef CONFIG_X86_32
98 COPY_SEG_CPL3(cs); 96 COPY_SEG_CPL3(cs);
99 COPY_SEG_CPL3(ss); 97 COPY_SEG_CPL3(ss);
100#else /* !CONFIG_X86_32 */
101 /* Kernel saves and restores only the CS segment register on signals,
102 * which is the bare minimum needed to allow mixed 32/64-bit code.
103 * App's signal handler can save/restore other segments if needed. */
104 COPY_SEG_CPL3(cs);
105#endif /* CONFIG_X86_32 */
106 98
107 get_user_ex(tmpflags, &sc->flags); 99 get_user_ex(tmpflags, &sc->flags);
108 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 100 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
109 regs->orig_ax = -1; /* disable syscall checks */ 101 regs->orig_ax = -1; /* disable syscall checks */
110 102
111 get_user_ex(buf, &sc->fpstate); 103 get_user_ex(buf, &sc->fpstate);
112
113 get_user_ex(*pax, &sc->ax);
114 } get_user_catch(err); 104 } get_user_catch(err);
115 105
116 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); 106 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
117 107
108 force_iret();
109
118 return err; 110 return err;
119} 111}
120 112
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
162#else /* !CONFIG_X86_32 */ 154#else /* !CONFIG_X86_32 */
163 put_user_ex(regs->flags, &sc->flags); 155 put_user_ex(regs->flags, &sc->flags);
164 put_user_ex(regs->cs, &sc->cs); 156 put_user_ex(regs->cs, &sc->cs);
165 put_user_ex(0, &sc->gs); 157 put_user_ex(0, &sc->__pad2);
166 put_user_ex(0, &sc->fs); 158 put_user_ex(0, &sc->__pad1);
159 put_user_ex(regs->ss, &sc->ss);
167#endif /* CONFIG_X86_32 */ 160#endif /* CONFIG_X86_32 */
168 161
169 put_user_ex(fpstate, &sc->fpstate); 162 put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
457 450
458 regs->sp = (unsigned long)frame; 451 regs->sp = (unsigned long)frame;
459 452
460 /* Set up the CS register to run signal handlers in 64-bit mode, 453 /*
461 even if the handler happens to be interrupting 32-bit code. */ 454 * Set up the CS and SS registers to run signal handlers in
455 * 64-bit mode, even if the handler happens to be interrupting
456 * 32-bit or 16-bit code.
457 *
458 * SS is subtle. In 64-bit mode, we don't need any particular
459 * SS descriptor, but we do need SS to be valid. It's possible
460 * that the old SS is entirely bogus -- this can happen if the
461 * signal we're trying to deliver is #GP or #SS caused by a bad
462 * SS value.
463 */
462 regs->cs = __USER_CS; 464 regs->cs = __USER_CS;
465 regs->ss = __USER_DS;
463 466
464 return 0; 467 return 0;
465} 468}
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
539{ 542{
540 struct pt_regs *regs = current_pt_regs(); 543 struct pt_regs *regs = current_pt_regs();
541 struct sigframe __user *frame; 544 struct sigframe __user *frame;
542 unsigned long ax;
543 sigset_t set; 545 sigset_t set;
544 546
545 frame = (struct sigframe __user *)(regs->sp - 8); 547 frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
553 555
554 set_current_blocked(&set); 556 set_current_blocked(&set);
555 557
556 if (restore_sigcontext(regs, &frame->sc, &ax)) 558 if (restore_sigcontext(regs, &frame->sc))
557 goto badframe; 559 goto badframe;
558 return ax; 560 return regs->ax;
559 561
560badframe: 562badframe:
561 signal_fault(regs, frame, "sigreturn"); 563 signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
568{ 570{
569 struct pt_regs *regs = current_pt_regs(); 571 struct pt_regs *regs = current_pt_regs();
570 struct rt_sigframe __user *frame; 572 struct rt_sigframe __user *frame;
571 unsigned long ax;
572 sigset_t set; 573 sigset_t set;
573 574
574 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 575 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,37 +580,23 @@ asmlinkage long sys_rt_sigreturn(void)
579 580
580 set_current_blocked(&set); 581 set_current_blocked(&set);
581 582
582 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 583 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
583 goto badframe; 584 goto badframe;
584 585
585 if (restore_altstack(&frame->uc.uc_stack)) 586 if (restore_altstack(&frame->uc.uc_stack))
586 goto badframe; 587 goto badframe;
587 588
588 return ax; 589 return regs->ax;
589 590
590badframe: 591badframe:
591 signal_fault(regs, frame, "rt_sigreturn"); 592 signal_fault(regs, frame, "rt_sigreturn");
592 return 0; 593 return 0;
593} 594}
594 595
595/*
596 * OK, we're invoking a handler:
597 */
598static int signr_convert(int sig)
599{
600#ifdef CONFIG_X86_32
601 struct thread_info *info = current_thread_info();
602
603 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
604 return info->exec_domain->signal_invmap[sig];
605#endif /* CONFIG_X86_32 */
606 return sig;
607}
608
609static int 596static int
610setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) 597setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
611{ 598{
612 int usig = signr_convert(ksig->sig); 599 int usig = ksig->sig;
613 sigset_t *set = sigmask_to_save(); 600 sigset_t *set = sigmask_to_save();
614 compat_sigset_t *cset = (compat_sigset_t *) set; 601 compat_sigset_t *cset = (compat_sigset_t *) set;
615 602
@@ -679,7 +666,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
679 * Ensure the signal handler starts with the new fpu state. 666 * Ensure the signal handler starts with the new fpu state.
680 */ 667 */
681 if (used_math()) 668 if (used_math())
682 drop_init_fpu(current); 669 fpu_reset_state(current);
683 } 670 }
684 signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); 671 signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
685} 672}
@@ -780,7 +767,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
780 struct pt_regs *regs = current_pt_regs(); 767 struct pt_regs *regs = current_pt_regs();
781 struct rt_sigframe_x32 __user *frame; 768 struct rt_sigframe_x32 __user *frame;
782 sigset_t set; 769 sigset_t set;
783 unsigned long ax;
784 770
785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 771 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
786 772
@@ -791,13 +777,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
791 777
792 set_current_blocked(&set); 778 set_current_blocked(&set);
793 779
794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 780 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
795 goto badframe; 781 goto badframe;
796 782
797 if (compat_restore_altstack(&frame->uc.uc_stack)) 783 if (compat_restore_altstack(&frame->uc.uc_stack))
798 goto badframe; 784 goto badframe;
799 785
800 return ax; 786 return regs->ax;
801 787
802badframe: 788badframe:
803 signal_fault(regs, frame, "x32 rt_sigreturn"); 789 signal_fault(regs, frame, "x32 rt_sigreturn");
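
The sigreturn changes above all follow the same shape: instead of passing the restored %ax back through an out-parameter, restore_sigcontext() now writes it into the register frame (the added COPY(ax)) and each sigreturn variant simply returns regs->ax. A minimal standalone sketch of that shape; the struct contents are trimmed to the fields needed for the illustration:

#include <stdint.h>

struct ex_regs       { uint64_t ax, ip, sp; };
struct ex_sigcontext { uint64_t ax, ip, sp; };

static int ex_restore_sigcontext(struct ex_regs *regs,
                                 const struct ex_sigcontext *sc)
{
        regs->ax = sc->ax;      /* COPY(ax): part of the normal copy now */
        regs->ip = sc->ip;
        regs->sp = sc->sp;
        return 0;
}

static long ex_sys_rt_sigreturn(struct ex_regs *regs,
                                const struct ex_sigcontext *sc)
{
        if (ex_restore_sigcontext(regs, sc))
                return -1;
        return regs->ax;        /* was: "return ax" from the out-parameter */
}
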
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..50e547eac8cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
77#include <asm/realmode.h> 77#include <asm/realmode.h>
78#include <asm/misc.h> 78#include <asm/misc.h>
79 79
80/* State of each CPU */
81DEFINE_PER_CPU(int, cpu_state) = { 0 };
82
83/* Number of siblings per CPU package */ 80/* Number of siblings per CPU package */
84int smp_num_siblings = 1; 81int smp_num_siblings = 1;
85EXPORT_SYMBOL(smp_num_siblings); 82EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
257 lock_vector_lock(); 254 lock_vector_lock();
258 set_cpu_online(smp_processor_id(), true); 255 set_cpu_online(smp_processor_id(), true);
259 unlock_vector_lock(); 256 unlock_vector_lock();
260 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 257 cpu_set_state_online(smp_processor_id());
261 x86_platform.nmi_init(); 258 x86_platform.nmi_init();
262 259
263 /* enable local interrupts */ 260 /* enable local interrupts */
@@ -779,6 +776,26 @@ out:
779 return boot_error; 776 return boot_error;
780} 777}
781 778
779void common_cpu_up(unsigned int cpu, struct task_struct *idle)
780{
781 /* Just in case we booted with a single CPU. */
782 alternatives_enable_smp();
783
784 per_cpu(current_task, cpu) = idle;
785
786#ifdef CONFIG_X86_32
787 /* Stack for startup_32 can be just as for start_secondary onwards */
788 irq_ctx_init(cpu);
789 per_cpu(cpu_current_top_of_stack, cpu) =
790 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
791#else
792 clear_tsk_thread_flag(idle, TIF_FORK);
793 initial_gs = per_cpu_offset(cpu);
794#endif
795 per_cpu(kernel_stack, cpu) =
796 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
797}
798
782/* 799/*
783 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 800 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
784 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 801 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
796 int cpu0_nmi_registered = 0; 813 int cpu0_nmi_registered = 0;
797 unsigned long timeout; 814 unsigned long timeout;
798 815
799 /* Just in case we booted with a single CPU. */
800 alternatives_enable_smp();
801
802 idle->thread.sp = (unsigned long) (((struct pt_regs *) 816 idle->thread.sp = (unsigned long) (((struct pt_regs *)
803 (THREAD_SIZE + task_stack_page(idle))) - 1); 817 (THREAD_SIZE + task_stack_page(idle))) - 1);
804 per_cpu(current_task, cpu) = idle;
805 818
806#ifdef CONFIG_X86_32
807 /* Stack for startup_32 can be just as for start_secondary onwards */
808 irq_ctx_init(cpu);
809#else
810 clear_tsk_thread_flag(idle, TIF_FORK);
811 initial_gs = per_cpu_offset(cpu);
812#endif
813 per_cpu(kernel_stack, cpu) =
814 (unsigned long)task_stack_page(idle) -
815 KERNEL_STACK_OFFSET + THREAD_SIZE;
816 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 819 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
817 initial_code = (unsigned long)start_secondary; 820 initial_code = (unsigned long)start_secondary;
818 stack_start = idle->thread.sp; 821 stack_start = idle->thread.sp;
@@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
948 */ 951 */
949 mtrr_save_state(); 952 mtrr_save_state();
950 953
951 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 954 /* x86 CPUs take themselves offline, so delayed offline is OK. */
955 err = cpu_check_up_prepare(cpu);
956 if (err && err != -EBUSY)
957 return err;
952 958
953 /* the FPU context is blank, nobody can own it */ 959 /* the FPU context is blank, nobody can own it */
954 __cpu_disable_lazy_restore(cpu); 960 __cpu_disable_lazy_restore(cpu);
955 961
962 common_cpu_up(cpu, tidle);
963
956 err = do_boot_cpu(apicid, cpu, tidle); 964 err = do_boot_cpu(apicid, cpu, tidle);
957 if (err) { 965 if (err) {
958 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); 966 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
1086 return SMP_NO_APIC; 1094 return SMP_NO_APIC;
1087 } 1095 }
1088 1096
1089 verify_local_APIC();
1090
1091 /* 1097 /*
1092 * If SMP should be disabled, then really disable it! 1098 * If SMP should be disabled, then really disable it!
1093 */ 1099 */
@@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
1191 switch_to_new_gdt(me); 1197 switch_to_new_gdt(me);
1192 /* already set me in cpu_online_mask in boot_cpu_init() */ 1198 /* already set me in cpu_online_mask in boot_cpu_init() */
1193 cpumask_set_cpu(me, cpu_callout_mask); 1199 cpumask_set_cpu(me, cpu_callout_mask);
1194 per_cpu(cpu_state, me) = CPU_ONLINE; 1200 cpu_set_state_online(me);
1195} 1201}
1196 1202
1197void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
1318 numa_remove_cpu(cpu); 1324 numa_remove_cpu(cpu);
1319} 1325}
1320 1326
1321static DEFINE_PER_CPU(struct completion, die_complete);
1322
1323void cpu_disable_common(void) 1327void cpu_disable_common(void)
1324{ 1328{
1325 int cpu = smp_processor_id(); 1329 int cpu = smp_processor_id();
1326 1330
1327 init_completion(&per_cpu(die_complete, smp_processor_id()));
1328
1329 remove_siblinginfo(cpu); 1331 remove_siblinginfo(cpu);
1330 1332
1331 /* It's now safe to remove this processor from the online map */ 1333 /* It's now safe to remove this processor from the online map */
@@ -1349,24 +1351,27 @@ int native_cpu_disable(void)
1349 return 0; 1351 return 0;
1350} 1352}
1351 1353
1352void cpu_die_common(unsigned int cpu) 1354int common_cpu_die(unsigned int cpu)
1353{ 1355{
1354 wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); 1356 int ret = 0;
1355}
1356 1357
1357void native_cpu_die(unsigned int cpu)
1358{
1359 /* We don't do anything here: idle task is faking death itself. */ 1358 /* We don't do anything here: idle task is faking death itself. */
1360 1359
1361 cpu_die_common(cpu);
1362
1363 /* They ack this in play_dead() by setting CPU_DEAD */ 1360 /* They ack this in play_dead() by setting CPU_DEAD */
1364 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1361 if (cpu_wait_death(cpu, 5)) {
1365 if (system_state == SYSTEM_RUNNING) 1362 if (system_state == SYSTEM_RUNNING)
1366 pr_info("CPU %u is now offline\n", cpu); 1363 pr_info("CPU %u is now offline\n", cpu);
1367 } else { 1364 } else {
1368 pr_err("CPU %u didn't die...\n", cpu); 1365 pr_err("CPU %u didn't die...\n", cpu);
1366 ret = -1;
1369 } 1367 }
1368
1369 return ret;
1370}
1371
1372void native_cpu_die(unsigned int cpu)
1373{
1374 common_cpu_die(cpu);
1370} 1375}
1371 1376
1372void play_dead_common(void) 1377void play_dead_common(void)
@@ -1375,10 +1380,8 @@ void play_dead_common(void)
1375 reset_lazy_tlbstate(); 1380 reset_lazy_tlbstate();
1376 amd_e400_remove_cpu(raw_smp_processor_id()); 1381 amd_e400_remove_cpu(raw_smp_processor_id());
1377 1382
1378 mb();
1379 /* Ack it */ 1383 /* Ack it */
1380 __this_cpu_write(cpu_state, CPU_DEAD); 1384 (void)cpu_report_death();
1381 complete(&per_cpu(die_complete, smp_processor_id()));
1382 1385
1383 /* 1386 /*
1384 * With physical CPU hotplug, we should halt the cpu 1387 * With physical CPU hotplug, we should halt the cpu
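
The hotplug rework above replaces the open-coded per-cpu completion and CPU_DEAD state with the generic cpu_report_death()/cpu_wait_death() pair: the dying CPU reports, the surviving CPU waits with a timeout. A rough standalone sketch of that handshake, with pthread primitives standing in for the kernel ones:

#include <pthread.h>
#include <time.h>

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ex_cond = PTHREAD_COND_INITIALIZER;
static int ex_cpu_dead;

static void ex_report_death(void)            /* dying CPU side */
{
        pthread_mutex_lock(&ex_lock);
        ex_cpu_dead = 1;
        pthread_cond_signal(&ex_cond);
        pthread_mutex_unlock(&ex_lock);
}

static int ex_wait_death(int seconds)        /* surviving CPU side */
{
        struct timespec ts;
        int ok = 1;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += seconds;

        pthread_mutex_lock(&ex_lock);
        while (!ex_cpu_dead && ok)
                ok = (pthread_cond_timedwait(&ex_cond, &ex_lock, &ts) == 0);
        pthread_mutex_unlock(&ex_lock);

        return ex_cpu_dead;     /* nonzero = died in time */
}
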
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e27431a..10e0272d789a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
34 return va_align.mask; 34 return va_align.mask;
35} 35}
36 36
37/*
38 * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
39 * va_align.bits, [12:upper_bit), are set to a random value instead of
40 * zeroing them. This random value is computed once per boot. This form
41 * of ASLR is known as "per-boot ASLR".
42 *
43 * To achieve this, the random value is added to the info.align_offset
44 * value before calling vm_unmapped_area() or ORed directly to the
45 * address.
46 */
47static unsigned long get_align_bits(void)
48{
49 return va_align.bits & get_align_mask();
50}
51
37unsigned long align_vdso_addr(unsigned long addr) 52unsigned long align_vdso_addr(unsigned long addr)
38{ 53{
39 unsigned long align_mask = get_align_mask(); 54 unsigned long align_mask = get_align_mask();
40 return (addr + align_mask) & ~align_mask; 55 addr = (addr + align_mask) & ~align_mask;
56 return addr | get_align_bits();
41} 57}
42 58
43static int __init control_va_addr_alignment(char *str) 59static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
135 info.length = len; 151 info.length = len;
136 info.low_limit = begin; 152 info.low_limit = begin;
137 info.high_limit = end; 153 info.high_limit = end;
138 info.align_mask = filp ? get_align_mask() : 0; 154 info.align_mask = 0;
139 info.align_offset = pgoff << PAGE_SHIFT; 155 info.align_offset = pgoff << PAGE_SHIFT;
156 if (filp) {
157 info.align_mask = get_align_mask();
158 info.align_offset += get_align_bits();
159 }
140 return vm_unmapped_area(&info); 160 return vm_unmapped_area(&info);
141} 161}
142 162
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
174 info.length = len; 194 info.length = len;
175 info.low_limit = PAGE_SIZE; 195 info.low_limit = PAGE_SIZE;
176 info.high_limit = mm->mmap_base; 196 info.high_limit = mm->mmap_base;
177 info.align_mask = filp ? get_align_mask() : 0; 197 info.align_mask = 0;
178 info.align_offset = pgoff << PAGE_SHIFT; 198 info.align_offset = pgoff << PAGE_SHIFT;
199 if (filp) {
200 info.align_mask = get_align_mask();
201 info.align_offset += get_align_bits();
202 }
179 addr = vm_unmapped_area(&info); 203 addr = vm_unmapped_area(&info);
180 if (!(addr & ~PAGE_MASK)) 204 if (!(addr & ~PAGE_MASK))
181 return addr; 205 return addr;
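
get_align_bits() exposes the per-boot random value masked to the alignment window; align_vdso_addr() rounds up and then ORs those bits in, while the two mmap paths add them to info.align_offset so vm_unmapped_area() returns addresses carrying the same bits. A small user-space sketch of the arithmetic, with made-up va_align values (the real mask and random bits are set once per boot by CPU setup code):

/* Sketch: per-boot ASLR bit mixing for I$-aliasing avoidance. */
#include <stdio.h>

/* Example values only: bits [14:12] randomized once per "boot". */
static const unsigned long align_mask = 0x7000UL;
static const unsigned long align_bits = 0x3000UL;	/* pretend the boot-time RNG gave us this */

static unsigned long align_addr(unsigned long addr)
{
	addr = (addr + align_mask) & ~align_mask;	/* round up, clearing the window bits */
	return addr | (align_bits & align_mask);	/* then OR in the per-boot random bits */
}

int main(void)
{
	unsigned long vdso = 0x7f0000001000UL;		/* made-up page-aligned candidate */

	printf("%#lx -> %#lx\n", vdso, align_addr(vdso));
	return 0;
}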
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; 8#ifdef CONFIG_IA32_EMULATION
9#define SYM(sym, compat) compat
10#else
11#define SYM(sym, compat) sym
12#define ia32_sys_call_table sys_call_table
13#define __NR_ia32_syscall_max __NR_syscall_max
14#endif
15
16#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
9#include <asm/syscalls_32.h> 17#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386 18#undef __SYSCALL_I386
11 19
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, 20#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
13 21
14typedef asmlinkage void (*sys_call_ptr_t)(void); 22typedef asmlinkage void (*sys_call_ptr_t)(void);
15 23
16extern asmlinkage void sys_ni_syscall(void); 24extern asmlinkage void sys_ni_syscall(void);
17 25
18__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 26__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /* 27 /*
20 * Smells like a compiler bug -- it doesn't work 28 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed. 29 * when the & below is removed.
22 */ 30 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall, 31 [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h> 32#include <asm/syscalls_32.h>
25}; 33};
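
The syscall_32.c hunk lets one table definition serve both configurations: under CONFIG_IA32_EMULATION the compat entry points fill ia32_sys_call_table, while a native 32-bit build aliases the names back so the same source produces sys_call_table. A cut-down sketch of the same macro trick with a made-up two-entry syscall list, using the GNU C range initializer the real table also relies on:

/* Sketch: one syscall list, two tables, selected by an alias macro. */
#include <stdio.h>

typedef void (*sys_call_ptr_t)(void);

static void sys_read_stub(void)         { }
static void sys_write_stub(void)        { }
static void compat_sys_read_stub(void)  { }
static void compat_sys_write_stub(void) { }
static void sys_ni_syscall(void)        { }

/* made-up stand-in for <asm/syscalls_32.h> */
#define SYSCALL_LIST(X)					\
	X(0, sys_read_stub,  compat_sys_read_stub)	\
	X(3, sys_write_stub, compat_sys_write_stub)

#ifdef COMPAT_TABLE				/* like CONFIG_IA32_EMULATION */
#define SYM(nr, sym, compat)	[nr] = compat,
#else
#define SYM(nr, sym, compat)	[nr] = sym,
#endif

#define NR_MAX 3

static const sys_call_ptr_t table[NR_MAX + 1] = {
	[0 ... NR_MAX] = sys_ni_syscall,	/* default every slot first... */
	SYSCALL_LIST(SYM)			/* ...then override the defined entries */
};

int main(void)
{
	printf("slot 1 is ni_syscall: %d\n", table[1] == sys_ni_syscall);
	printf("slot 0 overridden:    %d\n", table[0] != sys_ni_syscall);
	return 0;
}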
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
57 /* test 3: check the value hasn't changed */ 57 /* test 3: check the value hasn't changed */
58 /* If this test fails, we managed to overwrite the data */ 58 /* If this test fails, we managed to overwrite the data */
59 if (!rodata_test_data) { 59 if (!rodata_test_data) {
60 printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); 60 printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
61 return -ENODEV; 61 return -ENODEV;
62 } 62 }
63 /* test 4: check if the rodata section is 4Kb aligned */ 63 /* test 4: check if the rodata section is 4Kb aligned */
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
30{ 30{
31 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
32 32
33 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 33 if (!user_mode(regs) && in_lock_functions(pc)) {
34#ifdef CONFIG_FRAME_POINTER 34#ifdef CONFIG_FRAME_POINTER
35 return *(unsigned long *)(regs->bp + sizeof(long)); 35 return *(unsigned long *)(regs->bp + sizeof(long));
36#else 36#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..324ab5247687 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
112{ 112{
113 enum ctx_state prev_state; 113 enum ctx_state prev_state;
114 114
115 if (user_mode_vm(regs)) { 115 if (user_mode(regs)) {
116 /* Other than that, we're just an exception. */ 116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter(); 117 prev_state = exception_enter();
118 } else { 118 } else {
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
123 * but we need to notify RCU. 123 * but we need to notify RCU.
124 */ 124 */
125 rcu_nmi_enter(); 125 rcu_nmi_enter();
126 prev_state = IN_KERNEL; /* the value is irrelevant. */ 126 prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
127 } 127 }
128 128
129 /* 129 /*
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
146 /* Must be before exception_exit. */ 146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET); 147 preempt_count_sub(HARDIRQ_OFFSET);
148 148
149 if (user_mode_vm(regs)) 149 if (user_mode(regs))
150 return exception_exit(prev_state); 150 return exception_exit(prev_state);
151 else 151 else
152 rcu_nmi_exit(); 152 rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
158 * 158 *
159 * IST exception handlers normally cannot schedule. As a special 159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e. 160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not 161 * user_mode(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside 164 * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
167 */ 167 */
168void ist_begin_non_atomic(struct pt_regs *regs) 168void ist_begin_non_atomic(struct pt_regs *regs)
169{ 169{
170 BUG_ON(!user_mode_vm(regs)); 170 BUG_ON(!user_mode(regs));
171 171
172 /* 172 /*
173 * Sanity check: we need to be on the normal thread stack. This 173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_preempt_enable 174 * will catch asm bugs and any attempt to use ist_preempt_enable
175 * from double_fault. 175 * from double_fault.
176 */ 176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) 177 BUG_ON((unsigned long)(current_top_of_stack() -
178 & ~(THREAD_SIZE - 1)) != 0); 178 current_stack_pointer()) >= THREAD_SIZE);
179 179
180 preempt_count_sub(HARDIRQ_OFFSET); 180 preempt_count_sub(HARDIRQ_OFFSET);
181} 181}
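
The reworked BUG_ON above checks "are we on the normal thread stack" by measuring the distance from the top of the stack instead of masking the stack pointer and the per-CPU kernel_stack value down to a common THREAD_SIZE-aligned base. A stand-alone illustration of why the two tests agree for a pointer inside an aligned stack (the constants are examples; the unsigned subtraction in the new form also rejects pointers above the top, since they wrap to a huge value):

/* Sketch: two ways to ask "is sp on the current THREAD_SIZE-sized kernel stack?" */
#include <stdio.h>

#define THREAD_SIZE (16UL * 1024)	/* example size; the real value is per-arch/config */

static int on_stack_old(unsigned long sp, unsigned long known_stack_ptr)
{
	/* old test: both pointers fall in the same THREAD_SIZE-aligned block */
	return ((sp ^ known_stack_ptr) & ~(THREAD_SIZE - 1)) == 0;
}

static int on_stack_new(unsigned long sp, unsigned long top_of_stack)
{
	/* new test: sp lies within THREAD_SIZE bytes below the top of the stack */
	return (top_of_stack - sp) < THREAD_SIZE;
}

int main(void)
{
	unsigned long top = 0xffff880012344000UL;	/* made-up, THREAD_SIZE-aligned top */
	unsigned long sp  = top - 0x200;		/* 512 bytes of stack in use */

	printf("old=%d new=%d\n",
	       on_stack_old(sp, top - 64), on_stack_new(sp, top));
	return 0;
}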
@@ -194,8 +194,7 @@ static nokprobe_inline int
194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
195 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
196{ 196{
197#ifdef CONFIG_X86_32 197 if (v8086_mode(regs)) {
198 if (regs->flags & X86_VM_MASK) {
199 /* 198 /*
200 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 199 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
201 * On nmi (interrupt 2), do_trap should not be called. 200 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
207 } 206 }
208 return -1; 207 return -1;
209 } 208 }
210#endif 209
211 if (!user_mode(regs)) { 210 if (!user_mode(regs)) {
212 if (!fixup_exception(regs)) { 211 if (!fixup_exception(regs)) {
213 tsk->thread.error_code = error_code; 212 tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
384 goto exit; 383 goto exit;
385 conditional_sti(regs); 384 conditional_sti(regs);
386 385
387 if (!user_mode_vm(regs)) 386 if (!user_mode(regs))
388 die("bounds", regs, error_code); 387 die("bounds", regs, error_code);
389 388
390 if (!cpu_feature_enabled(X86_FEATURE_MPX)) { 389 if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
462 prev_state = exception_enter(); 461 prev_state = exception_enter();
463 conditional_sti(regs); 462 conditional_sti(regs);
464 463
465#ifdef CONFIG_X86_32 464 if (v8086_mode(regs)) {
466 if (regs->flags & X86_VM_MASK) {
467 local_irq_enable(); 465 local_irq_enable();
468 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 466 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
469 goto exit; 467 goto exit;
470 } 468 }
471#endif
472 469
473 tsk = current; 470 tsk = current;
474 if (!user_mode(regs)) { 471 if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
587 /* Copy the remainder of the stack from the current stack. */ 584 /* Copy the remainder of the stack from the current stack. */
588 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); 585 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
589 586
590 BUG_ON(!user_mode_vm(&new_stack->regs)); 587 BUG_ON(!user_mode(&new_stack->regs));
591 return new_stack; 588 return new_stack;
592} 589}
593NOKPROBE_SYMBOL(fixup_bad_iret); 590NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
637 * then it's very likely the result of an icebp/int01 trap. 634 * then it's very likely the result of an icebp/int01 trap.
638 * User wants a sigtrap for that. 635 * User wants a sigtrap for that.
639 */ 636 */
640 if (!dr6 && user_mode_vm(regs)) 637 if (!dr6 && user_mode(regs))
641 user_icebp = 1; 638 user_icebp = 1;
642 639
643 /* Catch kmemcheck conditions first of all! */ 640 /* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
673 /* It's safe to allow irq's after DR6 has been saved */ 670 /* It's safe to allow irq's after DR6 has been saved */
674 preempt_conditional_sti(regs); 671 preempt_conditional_sti(regs);
675 672
676 if (regs->flags & X86_VM_MASK) { 673 if (v8086_mode(regs)) {
677 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 674 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
678 X86_TRAP_DB); 675 X86_TRAP_DB);
679 preempt_conditional_cli(regs); 676 preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
721 return; 718 return;
722 conditional_sti(regs); 719 conditional_sti(regs);
723 720
724 if (!user_mode_vm(regs)) 721 if (!user_mode(regs))
725 { 722 {
726 if (!fixup_exception(regs)) { 723 if (!fixup_exception(regs)) {
727 task->thread.error_code = error_code; 724 task->thread.error_code = error_code;
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
734 /* 731 /*
735 * Save the info for the exception handler and clear the error. 732 * Save the info for the exception handler and clear the error.
736 */ 733 */
737 save_init_fpu(task); 734 unlazy_fpu(task);
738 task->thread.trap_nr = trapnr; 735 task->thread.trap_nr = trapnr;
739 task->thread.error_code = error_code; 736 task->thread.error_code = error_code;
740 info.si_signo = SIGFPE; 737 info.si_signo = SIGFPE;
@@ -863,7 +860,7 @@ void math_state_restore(void)
863 kernel_fpu_disable(); 860 kernel_fpu_disable();
864 __thread_fpu_begin(tsk); 861 __thread_fpu_begin(tsk);
865 if (unlikely(restore_fpu_checking(tsk))) { 862 if (unlikely(restore_fpu_checking(tsk))) {
866 drop_init_fpu(tsk); 863 fpu_reset_state(tsk);
867 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 864 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
868 } else { 865 } else {
869 tsk->thread.fpu_counter++; 866 tsk->thread.fpu_counter++;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
925/* Set of traps needed for early debugging. */ 922/* Set of traps needed for early debugging. */
926void __init early_trap_init(void) 923void __init early_trap_init(void)
927{ 924{
928 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 925 /*
926 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
927 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
928 * CPU runs at ring 0 so it is impossible to hit an invalid
929 * stack. Using the original stack works well enough at this
930 * early stage. DEBUG_STACK will be equipped after cpu_init() in
931 * trap_init().
932 *
933 * We don't need to set trace_idt_table like set_intr_gate(),
934 * since we don't have trace_debug and it will be reset to
935 * 'debug' in trap_init() by set_intr_gate_ist().
936 */
937 set_intr_gate_notrace(X86_TRAP_DB, debug);
929 /* int3 can be called from all */ 938 /* int3 can be called from all */
930 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 939 set_system_intr_gate(X86_TRAP_BP, &int3);
931#ifdef CONFIG_X86_32 940#ifdef CONFIG_X86_32
932 set_intr_gate(X86_TRAP_PF, page_fault); 941 set_intr_gate(X86_TRAP_PF, page_fault);
933#endif 942#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
1005 */ 1014 */
1006 cpu_init(); 1015 cpu_init();
1007 1016
1017 /*
1018 * X86_TRAP_DB and X86_TRAP_BP have been set
 1019 * in early_trap_init(). However, IST works only after
1020 * cpu_init() loads TSS. See comments in early_trap_init().
1021 */
1022 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
1023 /* int3 can be called from all */
1024 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
1025
1008 x86_init.irqs.trap_init(); 1026 x86_init.irqs.trap_init();
1009 1027
1010#ifdef CONFIG_X86_64 1028#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
912 int ret = NOTIFY_DONE; 912 int ret = NOTIFY_DONE;
913 913
914 /* We are only interested in userspace traps */ 914 /* We are only interested in userspace traps */
915 if (regs && !user_mode_vm(regs)) 915 if (regs && !user_mode(regs))
916 return NOTIFY_DONE; 916 return NOTIFY_DONE;
917 917
918 switch (val) { 918 switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
150 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
151 } 151 }
152 152
153 tss = &per_cpu(init_tss, get_cpu()); 153 tss = &per_cpu(cpu_tss, get_cpu());
154 current->thread.sp0 = current->thread.saved_sp0; 154 current->thread.sp0 = current->thread.saved_sp0;
155 current->thread.sysenter_cs = __KERNEL_CS; 155 current->thread.sysenter_cs = __KERNEL_CS;
156 load_sp0(tss, &current->thread); 156 load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 tsk->thread.saved_fs = info->regs32->fs; 318 tsk->thread.saved_fs = info->regs32->fs;
319 tsk->thread.saved_gs = get_user_gs(info->regs32); 319 tsk->thread.saved_gs = get_user_gs(info->regs32);
320 320
321 tss = &per_cpu(init_tss, get_cpu()); 321 tss = &per_cpu(cpu_tss, get_cpu());
322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
323 if (cpu_has_sep) 323 if (cpu_has_sep)
324 tsk->thread.sysenter_cs = 0; 324 tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index cdc6cf903078..87a815b85f3e 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
342 config_enabled(CONFIG_IA32_EMULATION)); 342 config_enabled(CONFIG_IA32_EMULATION));
343 343
344 if (!buf) { 344 if (!buf) {
345 drop_init_fpu(tsk); 345 fpu_reset_state(tsk);
346 return 0; 346 return 0;
347 } 347 }
348 348
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
416 */ 416 */
417 user_fpu_begin(); 417 user_fpu_begin();
418 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { 418 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
419 drop_init_fpu(tsk); 419 fpu_reset_state(tsk);
420 return -1; 420 return -1;
421 } 421 }
422 } 422 }
@@ -678,19 +678,13 @@ void xsave_init(void)
678 this_func(); 678 this_func();
679} 679}
680 680
681static inline void __init eager_fpu_init_bp(void) 681/*
682{ 682 * setup_init_fpu_buf() is __init and it is OK to call it here because
683 current->thread.fpu.state = 683 * init_xstate_buf will be unset only once during boot.
684 alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); 684 */
685 if (!init_xstate_buf) 685void __init_refok eager_fpu_init(void)
686 setup_init_fpu_buf();
687}
688
689void eager_fpu_init(void)
690{ 686{
691 static __refdata void (*boot_func)(void) = eager_fpu_init_bp; 687 WARN_ON(used_math());
692
693 clear_used_math();
694 current_thread_info()->status = 0; 688 current_thread_info()->status = 0;
695 689
696 if (eagerfpu == ENABLE) 690 if (eagerfpu == ENABLE)
@@ -701,21 +695,8 @@ void eager_fpu_init(void)
701 return; 695 return;
702 } 696 }
703 697
704 if (boot_func) { 698 if (!init_xstate_buf)
705 boot_func(); 699 setup_init_fpu_buf();
706 boot_func = NULL;
707 }
708
709 /*
710 * This is same as math_state_restore(). But use_xsave() is
711 * not yet patched to use math_state_restore().
712 */
713 init_fpu(current);
714 __thread_fpu_begin(current);
715 if (cpu_has_xsave)
716 xrstor_state(init_xstate_buf, -1);
717 else
718 fxrstor_checking(&init_xstate_buf->i387);
719} 700}
720 701
721/* 702/*
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 08f790dfadc9..16e8f962eaad 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a80737ee6e6..59b69f6a2844 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
104 ((best->eax & 0xff00) >> 8) != 0) 104 ((best->eax & 0xff00) >> 8) != 0)
105 return -EINVAL; 105 return -EINVAL;
106 106
107 /* Update physical-address width */
108 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
109
107 kvm_pmu_cpuid_update(vcpu); 110 kvm_pmu_cpuid_update(vcpu);
108 return 0; 111 return 0;
109} 112}
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
135 } 138 }
136} 139}
137 140
141int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
142{
143 struct kvm_cpuid_entry2 *best;
144
145 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
146 if (!best || best->eax < 0x80000008)
147 goto not_found;
148 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
149 if (best)
150 return best->eax & 0xff;
151not_found:
152 return 36;
153}
154EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
155
138/* when an old userspace process fills a new kernel module */ 156/* when an old userspace process fills a new kernel module */
139int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 157int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
140 struct kvm_cpuid *cpuid, 158 struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
757} 775}
758EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); 776EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
759 777
760int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
761{
762 struct kvm_cpuid_entry2 *best;
763
764 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
765 if (!best || best->eax < 0x80000008)
766 goto not_found;
767 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
768 if (best)
769 return best->eax & 0xff;
770not_found:
771 return 36;
772}
773EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
774
775/* 778/*
776 * If no match is found, check whether we exceed the vCPU's limit 779 * If no match is found, check whether we exceed the vCPU's limit
777 * and return the content of the highest valid _standard_ leaf instead. 780 * and return the content of the highest valid _standard_ leaf instead.
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 4452eedfaedd..c3b1ad9fca81 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
20 struct kvm_cpuid_entry2 __user *entries); 20 struct kvm_cpuid_entry2 __user *entries);
21void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 21void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
22 22
23int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
24
25static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
26{
27 return vcpu->arch.maxphyaddr;
28}
23 29
24static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 30static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
25{ 31{
26 struct kvm_cpuid_entry2 *best; 32 struct kvm_cpuid_entry2 *best;
27 33
28 if (!static_cpu_has(X86_FEATURE_XSAVE)) 34 if (!static_cpu_has(X86_FEATURE_XSAVE))
29 return 0; 35 return false;
30 36
31 best = kvm_find_cpuid_entry(vcpu, 1, 0); 37 best = kvm_find_cpuid_entry(vcpu, 1, 0);
32 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 38 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
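
cpuid_query_maxphyaddr() reads the guest's physical-address width from CPUID leaf 0x80000008 EAX[7:0], defaulting to 36 bits when the leaf is absent, and kvm_update_cpuid() now caches the result so the hot-path cpuid_maxphyaddr() is a plain field read. For context, a user-space sketch of the same leaf lookup against the host CPU, assuming a GCC/clang build on x86 and the <cpuid.h> intrinsics (nothing here is taken from the patch itself):

/* Sketch: host-side MAXPHYADDR lookup via CPUID leaf 0x80000008. */
#include <stdio.h>
#include <cpuid.h>

static int query_maxphyaddr(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Is the extended leaf even there? */
	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) || eax < 0x80000008)
		return 36;			/* same default the patch uses */

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 36;

	return eax & 0xff;			/* EAX[7:0] = physical address width */
}

int main(void)
{
	printf("MAXPHYADDR = %d bits\n", query_maxphyaddr());
	return 0;
}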
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106c01557f2b..630bcb0d7a04 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -248,27 +248,7 @@ struct mode_dual {
248 struct opcode mode64; 248 struct opcode mode64;
249}; 249};
250 250
251/* EFLAGS bit definitions. */
252#define EFLG_ID (1<<21)
253#define EFLG_VIP (1<<20)
254#define EFLG_VIF (1<<19)
255#define EFLG_AC (1<<18)
256#define EFLG_VM (1<<17)
257#define EFLG_RF (1<<16)
258#define EFLG_IOPL (3<<12)
259#define EFLG_NT (1<<14)
260#define EFLG_OF (1<<11)
261#define EFLG_DF (1<<10)
262#define EFLG_IF (1<<9)
263#define EFLG_TF (1<<8)
264#define EFLG_SF (1<<7)
265#define EFLG_ZF (1<<6)
266#define EFLG_AF (1<<4)
267#define EFLG_PF (1<<2)
268#define EFLG_CF (1<<0)
269
270#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a 251#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
271#define EFLG_RESERVED_ONE_MASK 2
272 252
273enum x86_transfer_type { 253enum x86_transfer_type {
274 X86_TRANSFER_NONE, 254 X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
317 * These EFLAGS bits are restored from saved value during emulation, and 297 * These EFLAGS bits are restored from saved value during emulation, and
318 * any changes are written back to the saved value after emulation. 298 * any changes are written back to the saved value after emulation.
319 */ 299 */
320#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) 300#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
301 X86_EFLAGS_PF|X86_EFLAGS_CF)
321 302
322#ifdef CONFIG_X86_64 303#ifdef CONFIG_X86_64
323#define ON64(x) x 304#define ON64(x) x
@@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
478 *dest = (*dest & ~mask) | (src & mask); 459 *dest = (*dest & ~mask) | (src & mask);
479} 460}
480 461
462static void assign_register(unsigned long *reg, u64 val, int bytes)
463{
464 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
465 switch (bytes) {
466 case 1:
467 *(u8 *)reg = (u8)val;
468 break;
469 case 2:
470 *(u16 *)reg = (u16)val;
471 break;
472 case 4:
473 *reg = (u32)val;
474 break; /* 64b: zero-extend */
475 case 8:
476 *reg = val;
477 break;
478 }
479}
480
481static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 481static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
482{ 482{
483 return (1UL << (ctxt->ad_bytes << 3)) - 1; 483 return (1UL << (ctxt->ad_bytes << 3)) - 1;
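
assign_register() centralizes the emulator's write-back rule: 1- and 2-byte stores only touch the low bits of the register slot, an 8-byte store replaces everything, and the 4-byte case deliberately clobbers the whole slot because 32-bit destinations zero-extend on x86-64. A stand-alone copy of that helper with a small demonstration (the starting register value is made up; the narrow stores assume little-endian layout, as on x86):

/* Sketch: the write-back widths used by assign_register(). */
#include <stdio.h>
#include <stdint.h>

static void assign_register(unsigned long *reg, uint64_t val, int bytes)
{
	switch (bytes) {
	case 1: *(uint8_t  *)reg = (uint8_t)val;  break;	/* keep bits 63:8  */
	case 2: *(uint16_t *)reg = (uint16_t)val; break;	/* keep bits 63:16 */
	case 4: *reg = (uint32_t)val;             break;	/* zero-extend, like 32-bit ops in long mode */
	case 8: *reg = val;                       break;
	}
}

int main(void)
{
	unsigned long reg = 0x1122334455667788UL;	/* made-up starting value */

	assign_register(&reg, 0xaabb, 2);
	printf("2-byte write: %#lx\n", reg);		/* 0x112233445566aabb */

	reg = 0x1122334455667788UL;
	assign_register(&reg, 0xaabbccdd, 4);
	printf("4-byte write: %#lx\n", reg);		/* 0xaabbccdd, upper half cleared */
	return 0;
}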
@@ -943,6 +943,22 @@ FASTOP2(xadd);
943 943
944FASTOP2R(cmp, cmp_r); 944FASTOP2R(cmp, cmp_r);
945 945
946static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
947{
948 /* If src is zero, do not writeback, but update flags */
949 if (ctxt->src.val == 0)
950 ctxt->dst.type = OP_NONE;
951 return fastop(ctxt, em_bsf);
952}
953
954static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
955{
956 /* If src is zero, do not writeback, but update flags */
957 if (ctxt->src.val == 0)
958 ctxt->dst.type = OP_NONE;
959 return fastop(ctxt, em_bsr);
960}
961
946static u8 test_cc(unsigned int condition, unsigned long flags) 962static u8 test_cc(unsigned int condition, unsigned long flags)
947{ 963{
948 u8 rc; 964 u8 rc;
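
em_bsf_c()/em_bsr_c() encode the architectural quirk that BSF/BSR with a zero source set ZF and leave the destination untouched; the emulator expresses "untouched" by suppressing write-back. A small sketch of the same semantics outside the emulator, using the compiler's count-trailing-zeros builtin for the non-zero case:

/* Sketch: BSF-like behaviour -- the destination is only written when src != 0. */
#include <stdio.h>
#include <stdbool.h>

static bool emulate_bsf(unsigned long src, unsigned long *dst, bool *zf)
{
	*zf = (src == 0);
	if (src == 0)
		return false;			/* no write-back, like ctxt->dst.type = OP_NONE */
	*dst = __builtin_ctzl(src);		/* index of lowest set bit */
	return true;
}

int main(void)
{
	unsigned long dst = 0xdeadbeef;		/* stale value must survive src == 0 */
	bool zf;

	emulate_bsf(0x48, &dst, &zf);		/* 0x48 = 0b1001000 -> bit 3 */
	printf("src=0x48: dst=%lu zf=%d\n", dst, (int)zf);

	dst = 0xdeadbeef;
	emulate_bsf(0, &dst, &zf);
	printf("src=0:    dst=%#lx zf=%d\n", dst, (int)zf);
	return 0;
}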
@@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1399 unsigned int in_page, n; 1415 unsigned int in_page, n;
1400 unsigned int count = ctxt->rep_prefix ? 1416 unsigned int count = ctxt->rep_prefix ?
1401 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; 1417 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
1402 in_page = (ctxt->eflags & EFLG_DF) ? 1418 in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
1403 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : 1419 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
1404 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); 1420 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
1405 n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); 1421 n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1412 } 1428 }
1413 1429
1414 if (ctxt->rep_prefix && (ctxt->d & String) && 1430 if (ctxt->rep_prefix && (ctxt->d & String) &&
1415 !(ctxt->eflags & EFLG_DF)) { 1431 !(ctxt->eflags & X86_EFLAGS_DF)) {
1416 ctxt->dst.data = rc->data + rc->pos; 1432 ctxt->dst.data = rc->data + rc->pos;
1417 ctxt->dst.type = OP_MEM_STR; 1433 ctxt->dst.type = OP_MEM_STR;
1418 ctxt->dst.count = (rc->end - rc->pos) / size; 1434 ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1691 1707
1692static void write_register_operand(struct operand *op) 1708static void write_register_operand(struct operand *op)
1693{ 1709{
1694 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ 1710 return assign_register(op->addr.reg, op->val, op->bytes);
1695 switch (op->bytes) {
1696 case 1:
1697 *(u8 *)op->addr.reg = (u8)op->val;
1698 break;
1699 case 2:
1700 *(u16 *)op->addr.reg = (u16)op->val;
1701 break;
1702 case 4:
1703 *op->addr.reg = (u32)op->val;
1704 break; /* 64b: zero-extend */
1705 case 8:
1706 *op->addr.reg = op->val;
1707 break;
1708 }
1709} 1711}
1710 1712
1711static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) 1713static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1792{ 1794{
1793 int rc; 1795 int rc;
1794 unsigned long val, change_mask; 1796 unsigned long val, change_mask;
1795 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1797 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
1796 int cpl = ctxt->ops->cpl(ctxt); 1798 int cpl = ctxt->ops->cpl(ctxt);
1797 1799
1798 rc = emulate_pop(ctxt, &val, len); 1800 rc = emulate_pop(ctxt, &val, len);
1799 if (rc != X86EMUL_CONTINUE) 1801 if (rc != X86EMUL_CONTINUE)
1800 return rc; 1802 return rc;
1801 1803
1802 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF 1804 change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
1803 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; 1805 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
1806 X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
1807 X86_EFLAGS_AC | X86_EFLAGS_ID;
1804 1808
1805 switch(ctxt->mode) { 1809 switch(ctxt->mode) {
1806 case X86EMUL_MODE_PROT64: 1810 case X86EMUL_MODE_PROT64:
1807 case X86EMUL_MODE_PROT32: 1811 case X86EMUL_MODE_PROT32:
1808 case X86EMUL_MODE_PROT16: 1812 case X86EMUL_MODE_PROT16:
1809 if (cpl == 0) 1813 if (cpl == 0)
1810 change_mask |= EFLG_IOPL; 1814 change_mask |= X86_EFLAGS_IOPL;
1811 if (cpl <= iopl) 1815 if (cpl <= iopl)
1812 change_mask |= EFLG_IF; 1816 change_mask |= X86_EFLAGS_IF;
1813 break; 1817 break;
1814 case X86EMUL_MODE_VM86: 1818 case X86EMUL_MODE_VM86:
1815 if (iopl < 3) 1819 if (iopl < 3)
1816 return emulate_gp(ctxt, 0); 1820 return emulate_gp(ctxt, 0);
1817 change_mask |= EFLG_IF; 1821 change_mask |= X86_EFLAGS_IF;
1818 break; 1822 break;
1819 default: /* real mode */ 1823 default: /* real mode */
1820 change_mask |= (EFLG_IOPL | EFLG_IF); 1824 change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
1821 break; 1825 break;
1822 } 1826 }
1823 1827
@@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
1918 1922
1919static int em_pushf(struct x86_emulate_ctxt *ctxt) 1923static int em_pushf(struct x86_emulate_ctxt *ctxt)
1920{ 1924{
1921 ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; 1925 ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
1922 return em_push(ctxt); 1926 return em_push(ctxt);
1923} 1927}
1924 1928
@@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1926{ 1930{
1927 int rc = X86EMUL_CONTINUE; 1931 int rc = X86EMUL_CONTINUE;
1928 int reg = VCPU_REGS_RDI; 1932 int reg = VCPU_REGS_RDI;
1933 u32 val;
1929 1934
1930 while (reg >= VCPU_REGS_RAX) { 1935 while (reg >= VCPU_REGS_RAX) {
1931 if (reg == VCPU_REGS_RSP) { 1936 if (reg == VCPU_REGS_RSP) {
@@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1933 --reg; 1938 --reg;
1934 } 1939 }
1935 1940
1936 rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); 1941 rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
1937 if (rc != X86EMUL_CONTINUE) 1942 if (rc != X86EMUL_CONTINUE)
1938 break; 1943 break;
1944 assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
1939 --reg; 1945 --reg;
1940 } 1946 }
1941 return rc; 1947 return rc;
@@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1956 if (rc != X86EMUL_CONTINUE) 1962 if (rc != X86EMUL_CONTINUE)
1957 return rc; 1963 return rc;
1958 1964
1959 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1965 ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
1960 1966
1961 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); 1967 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1962 rc = em_push(ctxt); 1968 rc = em_push(ctxt);
@@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2022 unsigned long temp_eip = 0; 2028 unsigned long temp_eip = 0;
2023 unsigned long temp_eflags = 0; 2029 unsigned long temp_eflags = 0;
2024 unsigned long cs = 0; 2030 unsigned long cs = 0;
2025 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | 2031 unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
2026 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | 2032 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
2027 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ 2033 X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
2028 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; 2034 X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
2035 X86_EFLAGS_AC | X86_EFLAGS_ID |
2036 X86_EFLAGS_FIXED;
2037 unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
2038 X86_EFLAGS_VIP;
2029 2039
2030 /* TODO: Add stack limit check */ 2040 /* TODO: Add stack limit check */
2031 2041
@@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2054 2064
2055 ctxt->_eip = temp_eip; 2065 ctxt->_eip = temp_eip;
2056 2066
2057
2058 if (ctxt->op_bytes == 4) 2067 if (ctxt->op_bytes == 4)
2059 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); 2068 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
2060 else if (ctxt->op_bytes == 2) { 2069 else if (ctxt->op_bytes == 2) {
@@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2063 } 2072 }
2064 2073
2065 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ 2074 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
2066 ctxt->eflags |= EFLG_RESERVED_ONE_MASK; 2075 ctxt->eflags |= X86_EFLAGS_FIXED;
2067 ctxt->ops->set_nmi_mask(ctxt, false); 2076 ctxt->ops->set_nmi_mask(ctxt, false);
2068 2077
2069 return rc; 2078 return rc;
@@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
2145 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { 2154 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
2146 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); 2155 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
2147 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); 2156 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
2148 ctxt->eflags &= ~EFLG_ZF; 2157 ctxt->eflags &= ~X86_EFLAGS_ZF;
2149 } else { 2158 } else {
2150 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | 2159 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
2151 (u32) reg_read(ctxt, VCPU_REGS_RBX); 2160 (u32) reg_read(ctxt, VCPU_REGS_RBX);
2152 2161
2153 ctxt->eflags |= EFLG_ZF; 2162 ctxt->eflags |= X86_EFLAGS_ZF;
2154 } 2163 }
2155 return X86EMUL_CONTINUE; 2164 return X86EMUL_CONTINUE;
2156} 2165}
@@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2222 ctxt->src.val = ctxt->dst.orig_val; 2231 ctxt->src.val = ctxt->dst.orig_val;
2223 fastop(ctxt, em_cmp); 2232 fastop(ctxt, em_cmp);
2224 2233
2225 if (ctxt->eflags & EFLG_ZF) { 2234 if (ctxt->eflags & X86_EFLAGS_ZF) {
2226 /* Success: write back to memory; no update of EAX */ 2235 /* Success: write back to memory; no update of EAX */
2227 ctxt->src.type = OP_NONE; 2236 ctxt->src.type = OP_NONE;
2228 ctxt->dst.val = ctxt->src.orig_val; 2237 ctxt->dst.val = ctxt->src.orig_val;
@@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2381 2390
2382 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); 2391 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
2383 ctxt->eflags &= ~msr_data; 2392 ctxt->eflags &= ~msr_data;
2384 ctxt->eflags |= EFLG_RESERVED_ONE_MASK; 2393 ctxt->eflags |= X86_EFLAGS_FIXED;
2385#endif 2394#endif
2386 } else { 2395 } else {
2387 /* legacy mode */ 2396 /* legacy mode */
2388 ops->get_msr(ctxt, MSR_STAR, &msr_data); 2397 ops->get_msr(ctxt, MSR_STAR, &msr_data);
2389 ctxt->_eip = (u32)msr_data; 2398 ctxt->_eip = (u32)msr_data;
2390 2399
2391 ctxt->eflags &= ~(EFLG_VM | EFLG_IF); 2400 ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
2392 } 2401 }
2393 2402
2394 return X86EMUL_CONTINUE; 2403 return X86EMUL_CONTINUE;
@@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2425 if ((msr_data & 0xfffc) == 0x0) 2434 if ((msr_data & 0xfffc) == 0x0)
2426 return emulate_gp(ctxt, 0); 2435 return emulate_gp(ctxt, 0);
2427 2436
2428 ctxt->eflags &= ~(EFLG_VM | EFLG_IF); 2437 ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
2429 cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK; 2438 cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
2430 ss_sel = cs_sel + 8; 2439 ss_sel = cs_sel + 8;
2431 if (efer & EFER_LMA) { 2440 if (efer & EFER_LMA) {
2432 cs.d = 0; 2441 cs.d = 0;
@@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2493 return emulate_gp(ctxt, 0); 2502 return emulate_gp(ctxt, 0);
2494 break; 2503 break;
2495 } 2504 }
2496 cs_sel |= SELECTOR_RPL_MASK; 2505 cs_sel |= SEGMENT_RPL_MASK;
2497 ss_sel |= SELECTOR_RPL_MASK; 2506 ss_sel |= SEGMENT_RPL_MASK;
2498 2507
2499 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2508 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2500 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2509 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
@@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2512 return false; 2521 return false;
2513 if (ctxt->mode == X86EMUL_MODE_VM86) 2522 if (ctxt->mode == X86EMUL_MODE_VM86)
2514 return true; 2523 return true;
2515 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2524 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
2516 return ctxt->ops->cpl(ctxt) > iopl; 2525 return ctxt->ops->cpl(ctxt) > iopl;
2517} 2526}
2518 2527
@@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2782 return ret; 2791 return ret;
2783 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, 2792 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
2784 X86_TRANSFER_TASK_SWITCH, NULL); 2793 X86_TRANSFER_TASK_SWITCH, NULL);
2785 if (ret != X86EMUL_CONTINUE)
2786 return ret;
2787 2794
2788 return X86EMUL_CONTINUE; 2795 return ret;
2789} 2796}
2790 2797
2791static int task_switch_32(struct x86_emulate_ctxt *ctxt, 2798static int task_switch_32(struct x86_emulate_ctxt *ctxt,
@@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2954static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, 2961static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
2955 struct operand *op) 2962 struct operand *op)
2956{ 2963{
2957 int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; 2964 int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
2958 2965
2959 register_address_increment(ctxt, reg, df * op->bytes); 2966 register_address_increment(ctxt, reg, df * op->bytes);
2960 op->addr.mem.ea = register_address(ctxt, reg); 2967 op->addr.mem.ea = register_address(ctxt, reg);
@@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
3323 return X86EMUL_CONTINUE; 3330 return X86EMUL_CONTINUE;
3324} 3331}
3325 3332
3326static int em_vmcall(struct x86_emulate_ctxt *ctxt) 3333static int em_hypercall(struct x86_emulate_ctxt *ctxt)
3327{ 3334{
3328 int rc = ctxt->ops->fix_hypercall(ctxt); 3335 int rc = ctxt->ops->fix_hypercall(ctxt);
3329 3336
@@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
3395 return em_lgdt_lidt(ctxt, true); 3402 return em_lgdt_lidt(ctxt, true);
3396} 3403}
3397 3404
3398static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
3399{
3400 int rc;
3401
3402 rc = ctxt->ops->fix_hypercall(ctxt);
3403
3404 /* Disable writeback. */
3405 ctxt->dst.type = OP_NONE;
3406 return rc;
3407}
3408
3409static int em_lidt(struct x86_emulate_ctxt *ctxt) 3405static int em_lidt(struct x86_emulate_ctxt *ctxt)
3410{ 3406{
3411 return em_lgdt_lidt(ctxt, false); 3407 return em_lgdt_lidt(ctxt, false);
@@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt)
3504{ 3500{
3505 u32 flags; 3501 u32 flags;
3506 3502
3507 flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; 3503 flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
3504 X86_EFLAGS_SF;
3508 flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; 3505 flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
3509 3506
3510 ctxt->eflags &= ~0xffUL; 3507 ctxt->eflags &= ~0xffUL;
@@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3769 3766
3770static const struct opcode group7_rm0[] = { 3767static const struct opcode group7_rm0[] = {
3771 N, 3768 N,
3772 I(SrcNone | Priv | EmulateOnUD, em_vmcall), 3769 I(SrcNone | Priv | EmulateOnUD, em_hypercall),
3773 N, N, N, N, N, N, 3770 N, N, N, N, N, N,
3774}; 3771};
3775 3772
@@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = {
3781 3778
3782static const struct opcode group7_rm3[] = { 3779static const struct opcode group7_rm3[] = {
3783 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 3780 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3784 II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), 3781 II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
3785 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), 3782 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3786 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), 3783 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3787 DIP(SrcNone | Prot | Priv, stgi, check_svme), 3784 DIP(SrcNone | Prot | Priv, stgi, check_svme),
@@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = {
4192 N, N, 4189 N, N,
4193 G(BitOp, group8), 4190 G(BitOp, group8),
4194 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 4191 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
4195 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), 4192 I(DstReg | SrcMem | ModRM, em_bsf_c),
4193 I(DstReg | SrcMem | ModRM, em_bsr_c),
4196 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4194 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
4197 /* 0xC0 - 0xC7 */ 4195 /* 0xC0 - 0xC7 */
4198 F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), 4196 F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
@@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
4759 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || 4757 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
4760 (ctxt->b == 0xae) || (ctxt->b == 0xaf)) 4758 (ctxt->b == 0xae) || (ctxt->b == 0xaf))
4761 && (((ctxt->rep_prefix == REPE_PREFIX) && 4759 && (((ctxt->rep_prefix == REPE_PREFIX) &&
4762 ((ctxt->eflags & EFLG_ZF) == 0)) 4760 ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
4763 || ((ctxt->rep_prefix == REPNE_PREFIX) && 4761 || ((ctxt->rep_prefix == REPNE_PREFIX) &&
4764 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) 4762 ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
4765 return true; 4763 return true;
4766 4764
4767 return false; 4765 return false;
@@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4913 /* All REP prefixes have the same first termination condition */ 4911 /* All REP prefixes have the same first termination condition */
4914 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { 4912 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
4915 ctxt->eip = ctxt->_eip; 4913 ctxt->eip = ctxt->_eip;
4916 ctxt->eflags &= ~EFLG_RF; 4914 ctxt->eflags &= ~X86_EFLAGS_RF;
4917 goto done; 4915 goto done;
4918 } 4916 }
4919 } 4917 }
@@ -4963,9 +4961,9 @@ special_insn:
4963 } 4961 }
4964 4962
4965 if (ctxt->rep_prefix && (ctxt->d & String)) 4963 if (ctxt->rep_prefix && (ctxt->d & String))
4966 ctxt->eflags |= EFLG_RF; 4964 ctxt->eflags |= X86_EFLAGS_RF;
4967 else 4965 else
4968 ctxt->eflags &= ~EFLG_RF; 4966 ctxt->eflags &= ~X86_EFLAGS_RF;
4969 4967
4970 if (ctxt->execute) { 4968 if (ctxt->execute) {
4971 if (ctxt->d & Fastop) { 4969 if (ctxt->d & Fastop) {
@@ -5014,7 +5012,7 @@ special_insn:
5014 rc = emulate_int(ctxt, ctxt->src.val); 5012 rc = emulate_int(ctxt, ctxt->src.val);
5015 break; 5013 break;
5016 case 0xce: /* into */ 5014 case 0xce: /* into */
5017 if (ctxt->eflags & EFLG_OF) 5015 if (ctxt->eflags & X86_EFLAGS_OF)
5018 rc = emulate_int(ctxt, 4); 5016 rc = emulate_int(ctxt, 4);
5019 break; 5017 break;
5020 case 0xe9: /* jmp rel */ 5018 case 0xe9: /* jmp rel */
@@ -5027,19 +5025,19 @@ special_insn:
5027 break; 5025 break;
5028 case 0xf5: /* cmc */ 5026 case 0xf5: /* cmc */
5029 /* complement carry flag from eflags reg */ 5027 /* complement carry flag from eflags reg */
5030 ctxt->eflags ^= EFLG_CF; 5028 ctxt->eflags ^= X86_EFLAGS_CF;
5031 break; 5029 break;
5032 case 0xf8: /* clc */ 5030 case 0xf8: /* clc */
5033 ctxt->eflags &= ~EFLG_CF; 5031 ctxt->eflags &= ~X86_EFLAGS_CF;
5034 break; 5032 break;
5035 case 0xf9: /* stc */ 5033 case 0xf9: /* stc */
5036 ctxt->eflags |= EFLG_CF; 5034 ctxt->eflags |= X86_EFLAGS_CF;
5037 break; 5035 break;
5038 case 0xfc: /* cld */ 5036 case 0xfc: /* cld */
5039 ctxt->eflags &= ~EFLG_DF; 5037 ctxt->eflags &= ~X86_EFLAGS_DF;
5040 break; 5038 break;
5041 case 0xfd: /* std */ 5039 case 0xfd: /* std */
5042 ctxt->eflags |= EFLG_DF; 5040 ctxt->eflags |= X86_EFLAGS_DF;
5043 break; 5041 break;
5044 default: 5042 default:
5045 goto cannot_emulate; 5043 goto cannot_emulate;
@@ -5100,7 +5098,7 @@ writeback:
5100 } 5098 }
5101 goto done; /* skip rip writeback */ 5099 goto done; /* skip rip writeback */
5102 } 5100 }
5103 ctxt->eflags &= ~EFLG_RF; 5101 ctxt->eflags &= ~X86_EFLAGS_RF;
5104 } 5102 }
5105 5103
5106 ctxt->eip = ctxt->_eip; 5104 ctxt->eip = ctxt->_eip;
@@ -5137,8 +5135,7 @@ twobyte_insn:
5137 case 0x40 ... 0x4f: /* cmov */ 5135 case 0x40 ... 0x4f: /* cmov */
5138 if (test_cc(ctxt->b, ctxt->eflags)) 5136 if (test_cc(ctxt->b, ctxt->eflags))
5139 ctxt->dst.val = ctxt->src.val; 5137 ctxt->dst.val = ctxt->src.val;
5140 else if (ctxt->mode != X86EMUL_MODE_PROT64 || 5138 else if (ctxt->op_bytes != 4)
5141 ctxt->op_bytes != 4)
5142 ctxt->dst.type = OP_NONE; /* no writeback */ 5139 ctxt->dst.type = OP_NONE; /* no writeback */
5143 break; 5140 break;
5144 case 0x80 ... 0x8f: /* jnz rel, etc*/ 5141 case 0x80 ... 0x8f: /* jnz rel, etc*/
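
The cmov hunk drops the long-mode test: with a 4-byte operand the destination is now written back even when the condition is false, because a 32-bit register destination zero-extends into bits 63:32 regardless of the condition, while the 2- and 8-byte forms leave the register alone. A short sketch of that rule (register value made up):

/* Sketch: CMOV write-back rule for a false condition, per operand size. */
#include <stdio.h>
#include <stdint.h>

static uint64_t cmov_false_cond(uint64_t dst, int op_bytes)
{
	/*
	 * Condition is false: 16- and 64-bit forms leave the register as-is;
	 * the 32-bit form still writes the (unchanged) low half, which
	 * zero-extends into bits 63:32 in long mode.
	 */
	if (op_bytes == 4)
		return (uint32_t)dst;
	return dst;
}

int main(void)
{
	uint64_t rbx = 0x1111222233334444ULL;	/* made-up value */

	printf("cmovw (false): %#llx\n", (unsigned long long)cmov_false_cond(rbx, 2));
	printf("cmovl (false): %#llx\n", (unsigned long long)cmov_false_cond(rbx, 4));
	return 0;
}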
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 298781d4cfb4..4dce6f8b6129 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr)
443 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 443 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
444} 444}
445 445
446static int pit_ioport_write(struct kvm_io_device *this, 446static int pit_ioport_write(struct kvm_vcpu *vcpu,
447 struct kvm_io_device *this,
447 gpa_t addr, int len, const void *data) 448 gpa_t addr, int len, const void *data)
448{ 449{
449 struct kvm_pit *pit = dev_to_pit(this); 450 struct kvm_pit *pit = dev_to_pit(this);
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
519 return 0; 520 return 0;
520} 521}
521 522
522static int pit_ioport_read(struct kvm_io_device *this, 523static int pit_ioport_read(struct kvm_vcpu *vcpu,
524 struct kvm_io_device *this,
523 gpa_t addr, int len, void *data) 525 gpa_t addr, int len, void *data)
524{ 526{
525 struct kvm_pit *pit = dev_to_pit(this); 527 struct kvm_pit *pit = dev_to_pit(this);
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
589 return 0; 591 return 0;
590} 592}
591 593
592static int speaker_ioport_write(struct kvm_io_device *this, 594static int speaker_ioport_write(struct kvm_vcpu *vcpu,
595 struct kvm_io_device *this,
593 gpa_t addr, int len, const void *data) 596 gpa_t addr, int len, const void *data)
594{ 597{
595 struct kvm_pit *pit = speaker_to_pit(this); 598 struct kvm_pit *pit = speaker_to_pit(this);
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this,
606 return 0; 609 return 0;
607} 610}
608 611
609static int speaker_ioport_read(struct kvm_io_device *this, 612static int speaker_ioport_read(struct kvm_vcpu *vcpu,
610 gpa_t addr, int len, void *data) 613 struct kvm_io_device *this,
614 gpa_t addr, int len, void *data)
611{ 615{
612 struct kvm_pit *pit = speaker_to_pit(this); 616 struct kvm_pit *pit = speaker_to_pit(this);
613 struct kvm_kpit_state *pit_state = &pit->pit_state; 617 struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index dd1b16b611b0..c84990b42b5b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5 5
6#include "iodev.h" 6#include <kvm/iodev.h>
7 7
8struct kvm_kpit_channel_state { 8struct kvm_kpit_channel_state {
9 u32 count; /* can be 65536 */ 9 u32 count; /* can be 65536 */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9541ba34126b..fef922ff2635 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -529,42 +529,42 @@ static int picdev_read(struct kvm_pic *s,
529 return 0; 529 return 0;
530} 530}
531 531
532static int picdev_master_write(struct kvm_io_device *dev, 532static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
533 gpa_t addr, int len, const void *val) 533 gpa_t addr, int len, const void *val)
534{ 534{
535 return picdev_write(container_of(dev, struct kvm_pic, dev_master), 535 return picdev_write(container_of(dev, struct kvm_pic, dev_master),
536 addr, len, val); 536 addr, len, val);
537} 537}
538 538
539static int picdev_master_read(struct kvm_io_device *dev, 539static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
540 gpa_t addr, int len, void *val) 540 gpa_t addr, int len, void *val)
541{ 541{
542 return picdev_read(container_of(dev, struct kvm_pic, dev_master), 542 return picdev_read(container_of(dev, struct kvm_pic, dev_master),
543 addr, len, val); 543 addr, len, val);
544} 544}
545 545
546static int picdev_slave_write(struct kvm_io_device *dev, 546static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
547 gpa_t addr, int len, const void *val) 547 gpa_t addr, int len, const void *val)
548{ 548{
549 return picdev_write(container_of(dev, struct kvm_pic, dev_slave), 549 return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
550 addr, len, val); 550 addr, len, val);
551} 551}
552 552
553static int picdev_slave_read(struct kvm_io_device *dev, 553static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
554 gpa_t addr, int len, void *val) 554 gpa_t addr, int len, void *val)
555{ 555{
556 return picdev_read(container_of(dev, struct kvm_pic, dev_slave), 556 return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
557 addr, len, val); 557 addr, len, val);
558} 558}
559 559
560static int picdev_eclr_write(struct kvm_io_device *dev, 560static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
561 gpa_t addr, int len, const void *val) 561 gpa_t addr, int len, const void *val)
562{ 562{
563 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), 563 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
564 addr, len, val); 564 addr, len, val);
565} 565}
566 566
567static int picdev_eclr_read(struct kvm_io_device *dev, 567static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
568 gpa_t addr, int len, void *val) 568 gpa_t addr, int len, void *val)
569{ 569{
570 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), 570 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
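
All of the PIT/PIC read and write callbacks above gain the issuing vcpu as an explicit first argument. A minimal sketch of a device fitting the new shape; the kvm_io_device_ops wiring is inferred from the signatures visible in these hunks and should be treated as an assumption, not as text from the patch:

/* Sketch: a trivial MMIO device using the vcpu-aware callbacks. */
#include <linux/kvm_host.h>
#include <kvm/iodev.h>		/* new home of the io-device declarations, per the hunks below */

struct demo_dev {
	struct kvm_io_device dev;
	u32 reg;
};

static int demo_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
		     gpa_t addr, int len, void *val)
{
	struct demo_dev *d = container_of(this, struct demo_dev, dev);

	if (len != 4)
		return -EOPNOTSUPP;
	*(u32 *)val = d->reg;		/* vcpu is now available for per-vcpu state */
	return 0;
}

static int demo_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
		      gpa_t addr, int len, const void *val)
{
	struct demo_dev *d = container_of(this, struct demo_dev, dev);

	if (len != 4)
		return -EOPNOTSUPP;
	d->reg = *(const u32 *)val;
	return 0;
}

static const struct kvm_io_device_ops demo_ops = {	/* field names assumed */
	.read  = demo_read,
	.write = demo_write,
};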
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 46d4449772bc..28146f03c514 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
206 206
207 old_irr = ioapic->irr; 207 old_irr = ioapic->irr;
208 ioapic->irr |= mask; 208 ioapic->irr |= mask;
209 if (edge)
210 ioapic->irr_delivered &= ~mask;
209 if ((edge && old_irr == ioapic->irr) || 211 if ((edge && old_irr == ioapic->irr) ||
210 (!edge && entry.fields.remote_irr)) { 212 (!edge && entry.fields.remote_irr)) {
211 ret = 0; 213 ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
349 irqe.shorthand = 0; 351 irqe.shorthand = 0;
350 352
351 if (irqe.trig_mode == IOAPIC_EDGE_TRIG) 353 if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
352 ioapic->irr &= ~(1 << irq); 354 ioapic->irr_delivered |= 1 << irq;
353 355
354 if (irq == RTC_GSI && line_status) { 356 if (irq == RTC_GSI && line_status) {
355 /* 357 /*
@@ -473,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
473 } 475 }
474} 476}
475 477
476bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
477{
478 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
479 smp_rmb();
480 return test_bit(vector, ioapic->handled_vectors);
481}
482
483void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) 478void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
484{ 479{
485 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 480 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
@@ -500,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
500 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); 495 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
501} 496}
502 497
503static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, 498static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
504 void *val) 499 gpa_t addr, int len, void *val)
505{ 500{
506 struct kvm_ioapic *ioapic = to_ioapic(this); 501 struct kvm_ioapic *ioapic = to_ioapic(this);
507 u32 result; 502 u32 result;
@@ -543,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
543 return 0; 538 return 0;
544} 539}
545 540
546static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, 541static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
547 const void *val) 542 gpa_t addr, int len, const void *val)
548{ 543{
549 struct kvm_ioapic *ioapic = to_ioapic(this); 544 struct kvm_ioapic *ioapic = to_ioapic(this);
550 u32 data; 545 u32 data;
@@ -599,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
599 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; 594 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
600 ioapic->ioregsel = 0; 595 ioapic->ioregsel = 0;
601 ioapic->irr = 0; 596 ioapic->irr = 0;
597 ioapic->irr_delivered = 0;
602 ioapic->id = 0; 598 ioapic->id = 0;
603 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); 599 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
604 rtc_irq_eoi_tracking_reset(ioapic); 600 rtc_irq_eoi_tracking_reset(ioapic);
@@ -656,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
656 652
657 spin_lock(&ioapic->lock); 653 spin_lock(&ioapic->lock);
658 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); 654 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
655 state->irr &= ~ioapic->irr_delivered;
659 spin_unlock(&ioapic->lock); 656 spin_unlock(&ioapic->lock);
660 return 0; 657 return 0;
661} 658}
@@ -669,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
669 spin_lock(&ioapic->lock); 666 spin_lock(&ioapic->lock);
670 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 667 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
671 ioapic->irr = 0; 668 ioapic->irr = 0;
669 ioapic->irr_delivered = 0;
672 update_handled_vectors(ioapic); 670 update_handled_vectors(ioapic);
673 kvm_vcpu_request_scan_ioapic(kvm); 671 kvm_vcpu_request_scan_ioapic(kvm);
674 kvm_ioapic_inject_all(ioapic, state->irr); 672 kvm_ioapic_inject_all(ioapic, state->irr);
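
The new irr_delivered bitmap remembers which edge-triggered IRR bits have already been injected, so kvm_get_ioapic() can mask them out of the saved state instead of clearing IRR at delivery time, and a fresh edge clears the delivered mark so the pin fires again. A small bitmask sketch of that bookkeeping (structure and function names here are illustrative only):

/* Sketch: edge-IRR bookkeeping with a "delivered" shadow mask. */
#include <stdio.h>
#include <stdint.h>

struct toy_ioapic {
	uint32_t irr;		/* pending interrupt requests       */
	uint32_t irr_delivered;	/* edge IRQs already injected       */
};

static void set_edge_irq(struct toy_ioapic *io, int pin)
{
	io->irr |= 1u << pin;
	io->irr_delivered &= ~(1u << pin);	/* new edge: needs delivery again */
}

static void deliver(struct toy_ioapic *io, int pin)
{
	io->irr_delivered |= 1u << pin;		/* keep irr set, just remember it went out */
}

static uint32_t saved_irr(const struct toy_ioapic *io)
{
	return io->irr & ~io->irr_delivered;	/* what userspace should see on save */
}

int main(void)
{
	struct toy_ioapic io = { 0, 0 };

	set_edge_irq(&io, 3);
	deliver(&io, 3);
	printf("saved irr after delivery: %#x\n", saved_irr(&io));	/* 0: already injected */

	set_edge_irq(&io, 3);						/* another edge */
	printf("saved irr after new edge: %#x\n", saved_irr(&io));	/* bit 3 pending again */
	return 0;
}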
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index c2e36d934af4..ca0b0b4e6256 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5 5
6#include "iodev.h" 6#include <kvm/iodev.h>
7 7
8struct kvm; 8struct kvm;
9struct kvm_vcpu; 9struct kvm_vcpu;
@@ -77,6 +77,7 @@ struct kvm_ioapic {
77 struct rtc_status rtc_status; 77 struct rtc_status rtc_status;
78 struct delayed_work eoi_inject; 78 struct delayed_work eoi_inject;
79 u32 irq_eoi[IOAPIC_NUM_PINS]; 79 u32 irq_eoi[IOAPIC_NUM_PINS];
80 u32 irr_delivered;
80}; 81};
81 82
82#ifdef DEBUG 83#ifdef DEBUG
@@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
97 return kvm->arch.vioapic; 98 return kvm->arch.vioapic;
98} 99}
99 100
101static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
102{
103 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
104 smp_rmb();
105 return test_bit(vector, ioapic->handled_vectors);
106}
107
100void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 108void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
101bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 109bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
102 int short_hand, unsigned int dest, int dest_mode); 110 int short_hand, unsigned int dest, int dest_mode);
103int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 111int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
104void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, 112void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
105 int trigger_mode); 113 int trigger_mode);
106bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
107int kvm_ioapic_init(struct kvm *kvm); 114int kvm_ioapic_init(struct kvm *kvm);
108void kvm_ioapic_destroy(struct kvm *kvm); 115void kvm_ioapic_destroy(struct kvm *kvm);
109int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 116int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2d03568e9498..ad68c73008c5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -27,7 +27,7 @@
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29 29
30#include "iodev.h" 30#include <kvm/iodev.h>
31#include "ioapic.h" 31#include "ioapic.h"
32#include "lapic.h" 32#include "lapic.h"
33 33
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4ee827d7bf36..d67206a7b99a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
133 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 133 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
134} 134}
135 135
136/* The logical map is definitely wrong if we have multiple
137 * modes at the same time. (Physical map is always right.)
138 */
139static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
140{
141 return !(map->mode & (map->mode - 1));
142}
143
144static inline void
145apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
146{
147 unsigned lid_bits;
148
149 BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4);
150 BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8);
151 BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16);
152 lid_bits = map->mode;
153
154 *cid = dest_id >> lid_bits;
155 *lid = dest_id & ((1 << lid_bits) - 1);
156}
157
136static void recalculate_apic_map(struct kvm *kvm) 158static void recalculate_apic_map(struct kvm *kvm)
137{ 159{
138 struct kvm_apic_map *new, *old = NULL; 160 struct kvm_apic_map *new, *old = NULL;
@@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm)
146 if (!new) 168 if (!new)
147 goto out; 169 goto out;
148 170
149 new->ldr_bits = 8;
150 /* flat mode is default */
151 new->cid_shift = 8;
152 new->cid_mask = 0;
153 new->lid_mask = 0xff;
154 new->broadcast = APIC_BROADCAST;
155
156 kvm_for_each_vcpu(i, vcpu, kvm) {
157 struct kvm_lapic *apic = vcpu->arch.apic;
158
159 if (!kvm_apic_present(vcpu))
160 continue;
161
162 if (apic_x2apic_mode(apic)) {
163 new->ldr_bits = 32;
164 new->cid_shift = 16;
165 new->cid_mask = new->lid_mask = 0xffff;
166 new->broadcast = X2APIC_BROADCAST;
167 } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
168 if (kvm_apic_get_reg(apic, APIC_DFR) ==
169 APIC_DFR_CLUSTER) {
170 new->cid_shift = 4;
171 new->cid_mask = 0xf;
172 new->lid_mask = 0xf;
173 } else {
174 new->cid_shift = 8;
175 new->cid_mask = 0;
176 new->lid_mask = 0xff;
177 }
178 }
179
180 /*
181 * All APICs have to be configured in the same mode by an OS.
 182			 * We take advantage of this while building logical id lookup
183 * table. After reset APICs are in software disabled mode, so if
184 * we find apic with different setting we assume this is the mode
185 * OS wants all apics to be in; build lookup table accordingly.
186 */
187 if (kvm_apic_sw_enabled(apic))
188 break;
189 }
190
191 kvm_for_each_vcpu(i, vcpu, kvm) { 171 kvm_for_each_vcpu(i, vcpu, kvm) {
192 struct kvm_lapic *apic = vcpu->arch.apic; 172 struct kvm_lapic *apic = vcpu->arch.apic;
193 u16 cid, lid; 173 u16 cid, lid;
@@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm)
198 178
199 aid = kvm_apic_id(apic); 179 aid = kvm_apic_id(apic);
200 ldr = kvm_apic_get_reg(apic, APIC_LDR); 180 ldr = kvm_apic_get_reg(apic, APIC_LDR);
201 cid = apic_cluster_id(new, ldr);
202 lid = apic_logical_id(new, ldr);
203 181
204 if (aid < ARRAY_SIZE(new->phys_map)) 182 if (aid < ARRAY_SIZE(new->phys_map))
205 new->phys_map[aid] = apic; 183 new->phys_map[aid] = apic;
184
185 if (apic_x2apic_mode(apic)) {
186 new->mode |= KVM_APIC_MODE_X2APIC;
187 } else if (ldr) {
188 ldr = GET_APIC_LOGICAL_ID(ldr);
189 if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
190 new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
191 else
192 new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
193 }
194
195 if (!kvm_apic_logical_map_valid(new))
196 continue;
197
198 apic_logical_id(new, ldr, &cid, &lid);
199
206 if (lid && cid < ARRAY_SIZE(new->logical_map)) 200 if (lid && cid < ARRAY_SIZE(new->logical_map))
207 new->logical_map[cid][ffs(lid) - 1] = apic; 201 new->logical_map[cid][ffs(lid) - 1] = apic;
208 } 202 }
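The rewritten recalculate_apic_map() above accumulates one mode bit per vCPU (cluster, flat or x2APIC) and only fills the logical table when a single mode is in use; apic_logical_id() then reuses the mode value itself as the number of low bits that select the entry within a cluster. A standalone sketch of both helpers, using the constants from the BUILD_BUG_ON lines and local names rather than the kernel symbols:

/* Standalone sketch of the mode bookkeeping and logical-ID split used
 * by the recalculate_apic_map() hunk above. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODE_XAPIC_CLUSTER 4	/* 4 ID bits per cluster entry */
#define MODE_XAPIC_FLAT    8	/* 8 ID bits, flat logical mode */
#define MODE_X2APIC       16	/* 16 ID bits in x2APIC */

/* Valid only if at most one mode bit was accumulated: a mix of flat,
 * cluster and x2APIC LDRs cannot be represented by one table. */
static bool logical_map_valid(unsigned mode)
{
	return !(mode & (mode - 1));
}

/* Split a logical destination into (cluster id, id within cluster);
 * the mode value doubles as the number of low bits kept for the id. */
static void split_logical_id(unsigned mode, uint32_t dest_id,
			     uint16_t *cid, uint16_t *lid)
{
	unsigned lid_bits = mode;

	*cid = dest_id >> lid_bits;
	*lid = dest_id & ((1u << lid_bits) - 1);
}

int main(void)
{
	uint16_t cid, lid;

	assert(logical_map_valid(MODE_XAPIC_FLAT));
	assert(!logical_map_valid(MODE_XAPIC_FLAT | MODE_X2APIC));

	/* x2APIC LDR 0x00020003 -> cluster 2, bit 3 within the cluster */
	split_logical_id(MODE_X2APIC, 0x00020003, &cid, &lid);
	printf("cid=%u lid=0x%x\n", (unsigned)cid, (unsigned)lid);
	assert(cid == 2 && lid == 0x3);
	return 0;
}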
@@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
588 apic_update_ppr(apic); 582 apic_update_ppr(apic);
589} 583}
590 584
591static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) 585static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
592{ 586{
593 return dest == (apic_x2apic_mode(apic) ? 587 if (apic_x2apic_mode(apic))
594 X2APIC_BROADCAST : APIC_BROADCAST); 588 return mda == X2APIC_BROADCAST;
589
590 return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
595} 591}
596 592
597static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) 593static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
598{ 594{
599 return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); 595 if (kvm_apic_broadcast(apic, mda))
596 return true;
597
598 if (apic_x2apic_mode(apic))
599 return mda == kvm_apic_id(apic);
600
601 return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
600} 602}
601 603
602static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) 604static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
613 && (logical_id & mda & 0xffff) != 0; 615 && (logical_id & mda & 0xffff) != 0;
614 616
615 logical_id = GET_APIC_LOGICAL_ID(logical_id); 617 logical_id = GET_APIC_LOGICAL_ID(logical_id);
618 mda = GET_APIC_DEST_FIELD(mda);
616 619
617 switch (kvm_apic_get_reg(apic, APIC_DFR)) { 620 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
618 case APIC_DFR_FLAT: 621 case APIC_DFR_FLAT:
@@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
627 } 630 }
628} 631}
629 632
633/* KVM APIC implementation has two quirks
634 * - dest always begins at 0 while xAPIC MDA has offset 24,
635 * - IOxAPIC messages have to be delivered (directly) to x2APIC.
636 */
637static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
638 struct kvm_lapic *target)
639{
640 bool ipi = source != NULL;
641 bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
642
643 if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
644 return X2APIC_BROADCAST;
645
646 return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
647}
648
630bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 649bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
631 int short_hand, unsigned int dest, int dest_mode) 650 int short_hand, unsigned int dest, int dest_mode)
632{ 651{
633 struct kvm_lapic *target = vcpu->arch.apic; 652 struct kvm_lapic *target = vcpu->arch.apic;
653 u32 mda = kvm_apic_mda(dest, source, target);
634 654
635 apic_debug("target %p, source %p, dest 0x%x, " 655 apic_debug("target %p, source %p, dest 0x%x, "
636 "dest_mode 0x%x, short_hand 0x%x\n", 656 "dest_mode 0x%x, short_hand 0x%x\n",
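kvm_apic_mda() above normalizes the incoming destination before the physical/logical match: xAPIC targets expect the 8-bit ID in bits 24-31 of the MDA, x2APIC targets take the 32-bit ID as-is, and a non-IPI 0xff broadcast aimed at an x2APIC vCPU is widened to the x2APIC broadcast ID. A self-contained model of that translation follows; the broadcast and shift constants are assumptions based on the usual APIC definitions, not quoted from this patch:

/* Rough standalone model of the kvm_apic_mda() logic in the hunk above. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define XAPIC_BROADCAST   0xffu		/* assumed 8-bit broadcast ID */
#define X2APIC_BROADCAST  0xffffffffu	/* assumed 32-bit broadcast ID */
#define XAPIC_DEST_SHIFT  24		/* MDA destination field offset */

static uint32_t toy_mda(uint32_t dest_id, bool from_ipi, bool x2apic_mda)
{
	/* Interrupts that did not come from another LAPIC (e.g. IOAPIC
	 * routes) may still carry the 8-bit broadcast value. */
	if (!from_ipi && dest_id == XAPIC_BROADCAST && x2apic_mda)
		return X2APIC_BROADCAST;

	return x2apic_mda ? dest_id : dest_id << XAPIC_DEST_SHIFT;
}

int main(void)
{
	/* xAPIC target: destination 0x12 lands in bits 24-31 */
	assert(toy_mda(0x12, true, false) == 0x12u << 24);
	/* IOAPIC-style broadcast reaching an x2APIC vCPU */
	assert(toy_mda(0xff, false, true) == X2APIC_BROADCAST);
	/* x2APIC IPI keeps its 32-bit destination untouched */
	assert(toy_mda(0x00010002, true, true) == 0x00010002);
	return 0;
}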
@@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
640 switch (short_hand) { 660 switch (short_hand) {
641 case APIC_DEST_NOSHORT: 661 case APIC_DEST_NOSHORT:
642 if (dest_mode == APIC_DEST_PHYSICAL) 662 if (dest_mode == APIC_DEST_PHYSICAL)
643 return kvm_apic_match_physical_addr(target, dest); 663 return kvm_apic_match_physical_addr(target, mda);
644 else 664 else
645 return kvm_apic_match_logical_addr(target, dest); 665 return kvm_apic_match_logical_addr(target, mda);
646 case APIC_DEST_SELF: 666 case APIC_DEST_SELF:
647 return target == source; 667 return target == source;
648 case APIC_DEST_ALLINC: 668 case APIC_DEST_ALLINC:
@@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
664 struct kvm_lapic **dst; 684 struct kvm_lapic **dst;
665 int i; 685 int i;
666 bool ret = false; 686 bool ret = false;
687 bool x2apic_ipi = src && apic_x2apic_mode(src);
667 688
668 *r = -1; 689 *r = -1;
669 690
@@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
675 if (irq->shorthand) 696 if (irq->shorthand)
676 return false; 697 return false;
677 698
699 if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
700 return false;
701
678 rcu_read_lock(); 702 rcu_read_lock();
679 map = rcu_dereference(kvm->arch.apic_map); 703 map = rcu_dereference(kvm->arch.apic_map);
680 704
681 if (!map) 705 if (!map)
682 goto out; 706 goto out;
683 707
684 if (irq->dest_id == map->broadcast)
685 goto out;
686
687 ret = true; 708 ret = true;
688 709
689 if (irq->dest_mode == APIC_DEST_PHYSICAL) { 710 if (irq->dest_mode == APIC_DEST_PHYSICAL) {
@@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
692 713
693 dst = &map->phys_map[irq->dest_id]; 714 dst = &map->phys_map[irq->dest_id];
694 } else { 715 } else {
695 u32 mda = irq->dest_id << (32 - map->ldr_bits); 716 u16 cid;
696 u16 cid = apic_cluster_id(map, mda); 717
718 if (!kvm_apic_logical_map_valid(map)) {
719 ret = false;
720 goto out;
721 }
722
723 apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
697 724
698 if (cid >= ARRAY_SIZE(map->logical_map)) 725 if (cid >= ARRAY_SIZE(map->logical_map))
699 goto out; 726 goto out;
700 727
701 dst = map->logical_map[cid]; 728 dst = map->logical_map[cid];
702 729
703 bitmap = apic_logical_id(map, mda);
704
705 if (irq->delivery_mode == APIC_DM_LOWEST) { 730 if (irq->delivery_mode == APIC_DM_LOWEST) {
706 int l = -1; 731 int l = -1;
707 for_each_set_bit(i, &bitmap, 16) { 732 for_each_set_bit(i, &bitmap, 16) {
@@ -1037,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
1037 addr < apic->base_address + LAPIC_MMIO_LENGTH; 1062 addr < apic->base_address + LAPIC_MMIO_LENGTH;
1038} 1063}
1039 1064
1040static int apic_mmio_read(struct kvm_io_device *this, 1065static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1041 gpa_t address, int len, void *data) 1066 gpa_t address, int len, void *data)
1042{ 1067{
1043 struct kvm_lapic *apic = to_lapic(this); 1068 struct kvm_lapic *apic = to_lapic(this);
@@ -1357,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
1357 return ret; 1382 return ret;
1358} 1383}
1359 1384
1360static int apic_mmio_write(struct kvm_io_device *this, 1385static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1361 gpa_t address, int len, const void *data) 1386 gpa_t address, int len, const void *data)
1362{ 1387{
1363 struct kvm_lapic *apic = to_lapic(this); 1388 struct kvm_lapic *apic = to_lapic(this);
@@ -1497,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1497 return; 1522 return;
1498 } 1523 }
1499 1524
1500 if (!kvm_vcpu_is_bsp(apic->vcpu))
1501 value &= ~MSR_IA32_APICBASE_BSP;
1502 vcpu->arch.apic_base = value; 1525 vcpu->arch.apic_base = value;
1503 1526
1504 /* update jump label if enable bit changes */ 1527 /* update jump label if enable bit changes */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0bc6c656625b..9d28383fc1e7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,7 +1,7 @@
1#ifndef __KVM_X86_LAPIC_H 1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include <kvm/iodev.h>
5 5
6#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
7 7
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
148 return kvm_x86_ops->vm_has_apicv(kvm); 148 return kvm_x86_ops->vm_has_apicv(kvm);
149} 149}
150 150
151static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
152{
153 u16 cid;
154 ldr >>= 32 - map->ldr_bits;
155 cid = (ldr >> map->cid_shift) & map->cid_mask;
156
157 return cid;
158}
159
160static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
161{
162 ldr >>= (32 - map->ldr_bits);
163 return ldr & map->lid_mask;
164}
165
166static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
167{ 152{
168 return vcpu->arch.apic->pending_events; 153 return vcpu->arch.apic->pending_events;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cee759299a35..146f295ee322 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4465 kvm_flush_remote_tlbs(kvm); 4465 kvm_flush_remote_tlbs(kvm);
4466} 4466}
4467 4467
4468static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4469 unsigned long *rmapp)
4470{
4471 u64 *sptep;
4472 struct rmap_iterator iter;
4473 int need_tlb_flush = 0;
4474 pfn_t pfn;
4475 struct kvm_mmu_page *sp;
4476
4477 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
4478 BUG_ON(!(*sptep & PT_PRESENT_MASK));
4479
4480 sp = page_header(__pa(sptep));
4481 pfn = spte_to_pfn(*sptep);
4482
4483 /*
4484 * Only EPT supported for now; otherwise, one would need to
4485 * find out efficiently whether the guest page tables are
4486 * also using huge pages.
4487 */
4488 if (sp->role.direct &&
4489 !kvm_is_reserved_pfn(pfn) &&
4490 PageTransCompound(pfn_to_page(pfn))) {
4491 drop_spte(kvm, sptep);
4492 sptep = rmap_get_first(*rmapp, &iter);
4493 need_tlb_flush = 1;
4494 } else
4495 sptep = rmap_get_next(&iter);
4496 }
4497
4498 return need_tlb_flush;
4499}
4500
4501void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
4502 struct kvm_memory_slot *memslot)
4503{
4504 bool flush = false;
4505 unsigned long *rmapp;
4506 unsigned long last_index, index;
4507 gfn_t gfn_start, gfn_end;
4508
4509 spin_lock(&kvm->mmu_lock);
4510
4511 gfn_start = memslot->base_gfn;
4512 gfn_end = memslot->base_gfn + memslot->npages - 1;
4513
4514 if (gfn_start >= gfn_end)
4515 goto out;
4516
4517 rmapp = memslot->arch.rmap[0];
4518 last_index = gfn_to_index(gfn_end, memslot->base_gfn,
4519 PT_PAGE_TABLE_LEVEL);
4520
4521 for (index = 0; index <= last_index; ++index, ++rmapp) {
4522 if (*rmapp)
4523 flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
4524
4525 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4526 if (flush) {
4527 kvm_flush_remote_tlbs(kvm);
4528 flush = false;
4529 }
4530 cond_resched_lock(&kvm->mmu_lock);
4531 }
4532 }
4533
4534 if (flush)
4535 kvm_flush_remote_tlbs(kvm);
4536
4537out:
4538 spin_unlock(&kvm->mmu_lock);
4539}
4540
4468void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 4541void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4469 struct kvm_memory_slot *memslot) 4542 struct kvm_memory_slot *memslot)
4470{ 4543{
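kvm_mmu_zap_collapsible_sptes() above scans a slot's rmaps, drops small-page SPTEs that could be remapped as huge pages, and takes care to issue any pending remote TLB flush before it voluntarily drops mmu_lock in cond_resched_lock(). The control flow reduces to the batching pattern below; every name in the sketch is a stand-in, not a KVM symbol:

/* Sketch of the scan pattern used by the mmu.c hunk above: walk one
 * rmap slot per iteration, remember whether anything was zapped, and
 * never drop the lock with a flush still pending. */
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 32

static bool zap_slot(unsigned long slot)
{
	/* pretend every fourth slot had a collapsible mapping */
	return (slot % 4) == 0;
}

static void flush_tlbs(void)
{
	puts("flush");
}

static bool should_yield(unsigned long slot)
{
	/* stand-in for need_resched() || spin_needbreak(lock) */
	return (slot % 10) == 9;
}

int main(void)
{
	bool flush = false;
	unsigned long slot;

	for (slot = 0; slot < NSLOTS; slot++) {
		flush |= zap_slot(slot);

		if (should_yield(slot)) {
			/* flush the batch before yielding the lock */
			if (flush) {
				flush_tlbs();
				flush = false;
			}
			/* cond_resched_lock() would go here */
		}
	}

	if (flush)
		flush_tlbs();
	return 0;
}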
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 8e6b7d869d2f..29fbf9dfdc54 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping {
38}; 38};
39 39
40/* mapping between fixed pmc index and arch_events array */ 40/* mapping between fixed pmc index and arch_events array */
41int fixed_pmc_events[] = {1, 0, 7}; 41static int fixed_pmc_events[] = {1, 0, 7};
42 42
43static bool pmc_is_gp(struct kvm_pmc *pmc) 43static bool pmc_is_gp(struct kvm_pmc *pmc)
44{ 44{
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cc618c882f90..ce741b8650f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1261 1261
1262 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | 1262 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1263 MSR_IA32_APICBASE_ENABLE; 1263 MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1264 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1265 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1265 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1266 1266
1267 svm_init_osvw(&svm->vcpu); 1267 svm_init_osvw(&svm->vcpu);
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
1929static int halt_interception(struct vcpu_svm *svm) 1929static int halt_interception(struct vcpu_svm *svm)
1930{ 1930{
1931 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1931 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1932 skip_emulated_instruction(&svm->vcpu);
1933 return kvm_emulate_halt(&svm->vcpu); 1932 return kvm_emulate_halt(&svm->vcpu);
1934} 1933}
1935 1934
1936static int vmmcall_interception(struct vcpu_svm *svm) 1935static int vmmcall_interception(struct vcpu_svm *svm)
1937{ 1936{
1938 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1937 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1939 skip_emulated_instruction(&svm->vcpu);
1940 kvm_emulate_hypercall(&svm->vcpu); 1938 kvm_emulate_hypercall(&svm->vcpu);
1941 return 1; 1939 return 1;
1942} 1940}
@@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
2757{ 2755{
2758 struct kvm_vcpu *vcpu = &svm->vcpu; 2756 struct kvm_vcpu *vcpu = &svm->vcpu;
2759 2757
2760 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], 2758 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2761 vcpu->arch.regs[VCPU_REGS_RAX]); 2759 kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2762 2760
2763 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2761 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2764 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 2762 kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2765 2763
2766 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2764 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2767 skip_emulated_instruction(&svm->vcpu); 2765 skip_emulated_instruction(&svm->vcpu);
@@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm)
2770 2768
2771static int skinit_interception(struct vcpu_svm *svm) 2769static int skinit_interception(struct vcpu_svm *svm)
2772{ 2770{
2773 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); 2771 trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2774 2772
2775 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2773 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2776 return 1; 2774 return 1;
2777} 2775}
2778 2776
2777static int wbinvd_interception(struct vcpu_svm *svm)
2778{
2779 kvm_emulate_wbinvd(&svm->vcpu);
2780 return 1;
2781}
2782
2779static int xsetbv_interception(struct vcpu_svm *svm) 2783static int xsetbv_interception(struct vcpu_svm *svm)
2780{ 2784{
2781 u64 new_bv = kvm_read_edx_eax(&svm->vcpu); 2785 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm)
2902 return 1; 2906 return 1;
2903} 2907}
2904 2908
2905bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) 2909static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
2910 unsigned long val)
2906{ 2911{
2907 unsigned long cr0 = svm->vcpu.arch.cr0; 2912 unsigned long cr0 = svm->vcpu.arch.cr0;
2908 bool ret = false; 2913 bool ret = false;
@@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm)
2940 return emulate_on_interception(svm); 2945 return emulate_on_interception(svm);
2941 2946
2942 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2947 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2943 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2948 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2949 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2950 else
2951 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2944 2952
2945 err = 0; 2953 err = 0;
2946 if (cr >= 16) { /* mov to cr */ 2954 if (cr >= 16) { /* mov to cr */
@@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3133 3141
3134static int rdmsr_interception(struct vcpu_svm *svm) 3142static int rdmsr_interception(struct vcpu_svm *svm)
3135{ 3143{
3136 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3144 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3137 u64 data; 3145 u64 data;
3138 3146
3139 if (svm_get_msr(&svm->vcpu, ecx, &data)) { 3147 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
@@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
3142 } else { 3150 } else {
3143 trace_kvm_msr_read(ecx, data); 3151 trace_kvm_msr_read(ecx, data);
3144 3152
3145 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 3153 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
3146 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 3154 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
3147 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3155 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3148 skip_emulated_instruction(&svm->vcpu); 3156 skip_emulated_instruction(&svm->vcpu);
3149 } 3157 }
@@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3246static int wrmsr_interception(struct vcpu_svm *svm) 3254static int wrmsr_interception(struct vcpu_svm *svm)
3247{ 3255{
3248 struct msr_data msr; 3256 struct msr_data msr;
3249 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3257 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3250 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3258 u64 data = kvm_read_edx_eax(&svm->vcpu);
3251 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3252 3259
3253 msr.data = data; 3260 msr.data = data;
3254 msr.index = ecx; 3261 msr.index = ecx;
@@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3325 [SVM_EXIT_READ_CR3] = cr_interception, 3332 [SVM_EXIT_READ_CR3] = cr_interception,
3326 [SVM_EXIT_READ_CR4] = cr_interception, 3333 [SVM_EXIT_READ_CR4] = cr_interception,
3327 [SVM_EXIT_READ_CR8] = cr_interception, 3334 [SVM_EXIT_READ_CR8] = cr_interception,
3328 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3335 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
3329 [SVM_EXIT_WRITE_CR0] = cr_interception, 3336 [SVM_EXIT_WRITE_CR0] = cr_interception,
3330 [SVM_EXIT_WRITE_CR3] = cr_interception, 3337 [SVM_EXIT_WRITE_CR3] = cr_interception,
3331 [SVM_EXIT_WRITE_CR4] = cr_interception, 3338 [SVM_EXIT_WRITE_CR4] = cr_interception,
@@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3376 [SVM_EXIT_STGI] = stgi_interception, 3383 [SVM_EXIT_STGI] = stgi_interception,
3377 [SVM_EXIT_CLGI] = clgi_interception, 3384 [SVM_EXIT_CLGI] = clgi_interception,
3378 [SVM_EXIT_SKINIT] = skinit_interception, 3385 [SVM_EXIT_SKINIT] = skinit_interception,
3379 [SVM_EXIT_WBINVD] = emulate_on_interception, 3386 [SVM_EXIT_WBINVD] = wbinvd_interception,
3380 [SVM_EXIT_MONITOR] = monitor_interception, 3387 [SVM_EXIT_MONITOR] = monitor_interception,
3381 [SVM_EXIT_MWAIT] = mwait_interception, 3388 [SVM_EXIT_MWAIT] = mwait_interception,
3382 [SVM_EXIT_XSETBV] = xsetbv_interception, 3389 [SVM_EXIT_XSETBV] = xsetbv_interception,
@@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3555 3562
3556 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 3563 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3557 || !svm_exit_handlers[exit_code]) { 3564 || !svm_exit_handlers[exit_code]) {
3558 WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); 3565 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
3559 kvm_queue_exception(vcpu, UD_VECTOR); 3566 kvm_queue_exception(vcpu, UD_VECTOR);
3560 return 1; 3567 return 1;
3561 } 3568 }
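The svm.c MSR hunks above swap direct vcpu->arch.regs[] accesses for kvm_register_read()/kvm_register_write() and let kvm_read_edx_eax() build the 64-bit value for wrmsr; the CR0_SEL_WRITE exit is likewise routed into cr_interception() as a CR0 write. The register arithmetic behind the EDX:EAX handling is plain bit splitting, shown standalone here with local helper names:

/* Self-contained check of the EDX:EAX split/recombine that the
 * rdmsr/wrmsr hunks above now delegate to the generic helpers. */
#include <assert.h>
#include <stdint.h>

static uint64_t combine_edx_eax(uint32_t eax, uint32_t edx)
{
	return ((uint64_t)edx << 32) | eax;
}

static void split_edx_eax(uint64_t data, uint32_t *eax, uint32_t *edx)
{
	*eax = data & 0xffffffffu;	/* low half -> EAX */
	*edx = data >> 32;		/* high half -> EDX */
}

int main(void)
{
	uint32_t eax, edx;
	uint64_t msr = 0x123456789abcdef0ull;

	split_edx_eax(msr, &eax, &edx);
	assert(eax == 0x9abcdef0u && edx == 0x12345678u);
	assert(combine_edx_eax(eax, edx) == msr);
	return 0;
}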
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae4f6d35d19c..f5e8dce8046c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2470,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2470 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2470 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2471 vmx->nested.nested_vmx_secondary_ctls_high &= 2471 vmx->nested.nested_vmx_secondary_ctls_high &=
2472 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2472 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2473 SECONDARY_EXEC_RDTSCP |
2473 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2474 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2474 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2475 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2476 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -3268,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3268 * default value. 3269 * default value.
3269 */ 3270 */
3270 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3271 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3271 save->selector &= ~SELECTOR_RPL_MASK; 3272 save->selector &= ~SEGMENT_RPL_MASK;
3272 save->dpl = save->selector & SELECTOR_RPL_MASK; 3273 save->dpl = save->selector & SEGMENT_RPL_MASK;
3273 save->s = 1; 3274 save->s = 1;
3274 } 3275 }
3275 vmx_set_segment(vcpu, save, seg); 3276 vmx_set_segment(vcpu, save, seg);
@@ -3842,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
3842 unsigned int cs_rpl; 3843 unsigned int cs_rpl;
3843 3844
3844 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3845 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3845 cs_rpl = cs.selector & SELECTOR_RPL_MASK; 3846 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3846 3847
3847 if (cs.unusable) 3848 if (cs.unusable)
3848 return false; 3849 return false;
@@ -3870,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3870 unsigned int ss_rpl; 3871 unsigned int ss_rpl;
3871 3872
3872 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3873 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3873 ss_rpl = ss.selector & SELECTOR_RPL_MASK; 3874 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3874 3875
3875 if (ss.unusable) 3876 if (ss.unusable)
3876 return true; 3877 return true;
@@ -3892,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3892 unsigned int rpl; 3893 unsigned int rpl;
3893 3894
3894 vmx_get_segment(vcpu, &var, seg); 3895 vmx_get_segment(vcpu, &var, seg);
3895 rpl = var.selector & SELECTOR_RPL_MASK; 3896 rpl = var.selector & SEGMENT_RPL_MASK;
3896 3897
3897 if (var.unusable) 3898 if (var.unusable)
3898 return true; 3899 return true;
@@ -3919,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
3919 3920
3920 if (tr.unusable) 3921 if (tr.unusable)
3921 return false; 3922 return false;
3922 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3923 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3923 return false; 3924 return false;
3924 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3925 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3925 return false; 3926 return false;
@@ -3937,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
3937 3938
3938 if (ldtr.unusable) 3939 if (ldtr.unusable)
3939 return true; 3940 return true;
3940 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3941 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3941 return false; 3942 return false;
3942 if (ldtr.type != 2) 3943 if (ldtr.type != 2)
3943 return false; 3944 return false;
@@ -3954,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3954 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3955 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3955 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3956 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3956 3957
3957 return ((cs.selector & SELECTOR_RPL_MASK) == 3958 return ((cs.selector & SEGMENT_RPL_MASK) ==
3958 (ss.selector & SELECTOR_RPL_MASK)); 3959 (ss.selector & SEGMENT_RPL_MASK));
3959} 3960}
3960 3961
3961/* 3962/*
@@ -4711,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4711 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4712 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4712 kvm_set_cr8(&vmx->vcpu, 0); 4713 kvm_set_cr8(&vmx->vcpu, 0);
4713 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; 4714 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
4714 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4715 if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
4715 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4716 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4716 apic_base_msr.host_initiated = true; 4717 apic_base_msr.host_initiated = true;
4717 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); 4718 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
@@ -5006,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5006 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 5007 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
5007 if (vcpu->arch.halt_request) { 5008 if (vcpu->arch.halt_request) {
5008 vcpu->arch.halt_request = 0; 5009 vcpu->arch.halt_request = 0;
5009 return kvm_emulate_halt(vcpu); 5010 return kvm_vcpu_halt(vcpu);
5010 } 5011 }
5011 return 1; 5012 return 1;
5012 } 5013 }
@@ -5071,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5071 } 5072 }
5072 5073
5073 if (is_invalid_opcode(intr_info)) { 5074 if (is_invalid_opcode(intr_info)) {
5075 if (is_guest_mode(vcpu)) {
5076 kvm_queue_exception(vcpu, UD_VECTOR);
5077 return 1;
5078 }
5074 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 5079 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5075 if (er != EMULATE_DONE) 5080 if (er != EMULATE_DONE)
5076 kvm_queue_exception(vcpu, UD_VECTOR); 5081 kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5090,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5090 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5095 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5091 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5096 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5092 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5097 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5093 vcpu->run->internal.ndata = 2; 5098 vcpu->run->internal.ndata = 3;
5094 vcpu->run->internal.data[0] = vect_info; 5099 vcpu->run->internal.data[0] = vect_info;
5095 vcpu->run->internal.data[1] = intr_info; 5100 vcpu->run->internal.data[1] = intr_info;
5101 vcpu->run->internal.data[2] = error_code;
5096 return 0; 5102 return 0;
5097 } 5103 }
5098 5104
@@ -5533,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5533 5539
5534static int handle_halt(struct kvm_vcpu *vcpu) 5540static int handle_halt(struct kvm_vcpu *vcpu)
5535{ 5541{
5536 skip_emulated_instruction(vcpu);
5537 return kvm_emulate_halt(vcpu); 5542 return kvm_emulate_halt(vcpu);
5538} 5543}
5539 5544
5540static int handle_vmcall(struct kvm_vcpu *vcpu) 5545static int handle_vmcall(struct kvm_vcpu *vcpu)
5541{ 5546{
5542 skip_emulated_instruction(vcpu);
5543 kvm_emulate_hypercall(vcpu); 5547 kvm_emulate_hypercall(vcpu);
5544 return 1; 5548 return 1;
5545} 5549}
@@ -5570,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
5570 5574
5571static int handle_wbinvd(struct kvm_vcpu *vcpu) 5575static int handle_wbinvd(struct kvm_vcpu *vcpu)
5572{ 5576{
5573 skip_emulated_instruction(vcpu);
5574 kvm_emulate_wbinvd(vcpu); 5577 kvm_emulate_wbinvd(vcpu);
5575 return 1; 5578 return 1;
5576} 5579}
@@ -5828,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5828 gpa_t gpa; 5831 gpa_t gpa;
5829 5832
5830 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5833 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5831 if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5834 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5832 skip_emulated_instruction(vcpu); 5835 skip_emulated_instruction(vcpu);
5833 return 1; 5836 return 1;
5834 } 5837 }
@@ -5909,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5909 5912
5910 if (vcpu->arch.halt_request) { 5913 if (vcpu->arch.halt_request) {
5911 vcpu->arch.halt_request = 0; 5914 vcpu->arch.halt_request = 0;
5912 ret = kvm_emulate_halt(vcpu); 5915 ret = kvm_vcpu_halt(vcpu);
5913 goto out; 5916 goto out;
5914 } 5917 }
5915 5918
@@ -7318,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
7318 else if (port < 0x10000) 7321 else if (port < 0x10000)
7319 bitmap = vmcs12->io_bitmap_b; 7322 bitmap = vmcs12->io_bitmap_b;
7320 else 7323 else
7321 return 1; 7324 return true;
7322 bitmap += (port & 0x7fff) / 8; 7325 bitmap += (port & 0x7fff) / 8;
7323 7326
7324 if (last_bitmap != bitmap) 7327 if (last_bitmap != bitmap)
7325 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 7328 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
7326 return 1; 7329 return true;
7327 if (b & (1 << (port & 7))) 7330 if (b & (1 << (port & 7)))
7328 return 1; 7331 return true;
7329 7332
7330 port++; 7333 port++;
7331 size--; 7334 size--;
7332 last_bitmap = bitmap; 7335 last_bitmap = bitmap;
7333 } 7336 }
7334 7337
7335 return 0; 7338 return false;
7336} 7339}
7337 7340
7338/* 7341/*
@@ -7348,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7348 gpa_t bitmap; 7351 gpa_t bitmap;
7349 7352
7350 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 7353 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
7351 return 1; 7354 return true;
7352 7355
7353 /* 7356 /*
7354 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 7357 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
@@ -7367,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7367 if (msr_index < 1024*8) { 7370 if (msr_index < 1024*8) {
7368 unsigned char b; 7371 unsigned char b;
7369 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 7372 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
7370 return 1; 7373 return true;
7371 return 1 & (b >> (msr_index & 7)); 7374 return 1 & (b >> (msr_index & 7));
7372 } else 7375 } else
7373 return 1; /* let L1 handle the wrong parameter */ 7376 return true; /* let L1 handle the wrong parameter */
7374} 7377}
7375 7378
7376/* 7379/*
@@ -7392,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7392 case 0: 7395 case 0:
7393 if (vmcs12->cr0_guest_host_mask & 7396 if (vmcs12->cr0_guest_host_mask &
7394 (val ^ vmcs12->cr0_read_shadow)) 7397 (val ^ vmcs12->cr0_read_shadow))
7395 return 1; 7398 return true;
7396 break; 7399 break;
7397 case 3: 7400 case 3:
7398 if ((vmcs12->cr3_target_count >= 1 && 7401 if ((vmcs12->cr3_target_count >= 1 &&
@@ -7403,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7403 vmcs12->cr3_target_value2 == val) || 7406 vmcs12->cr3_target_value2 == val) ||
7404 (vmcs12->cr3_target_count >= 4 && 7407 (vmcs12->cr3_target_count >= 4 &&
7405 vmcs12->cr3_target_value3 == val)) 7408 vmcs12->cr3_target_value3 == val))
7406 return 0; 7409 return false;
7407 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 7410 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
7408 return 1; 7411 return true;
7409 break; 7412 break;
7410 case 4: 7413 case 4:
7411 if (vmcs12->cr4_guest_host_mask & 7414 if (vmcs12->cr4_guest_host_mask &
7412 (vmcs12->cr4_read_shadow ^ val)) 7415 (vmcs12->cr4_read_shadow ^ val))
7413 return 1; 7416 return true;
7414 break; 7417 break;
7415 case 8: 7418 case 8:
7416 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 7419 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
7417 return 1; 7420 return true;
7418 break; 7421 break;
7419 } 7422 }
7420 break; 7423 break;
7421 case 2: /* clts */ 7424 case 2: /* clts */
7422 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 7425 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
7423 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 7426 (vmcs12->cr0_read_shadow & X86_CR0_TS))
7424 return 1; 7427 return true;
7425 break; 7428 break;
7426 case 1: /* mov from cr */ 7429 case 1: /* mov from cr */
7427 switch (cr) { 7430 switch (cr) {
7428 case 3: 7431 case 3:
7429 if (vmcs12->cpu_based_vm_exec_control & 7432 if (vmcs12->cpu_based_vm_exec_control &
7430 CPU_BASED_CR3_STORE_EXITING) 7433 CPU_BASED_CR3_STORE_EXITING)
7431 return 1; 7434 return true;
7432 break; 7435 break;
7433 case 8: 7436 case 8:
7434 if (vmcs12->cpu_based_vm_exec_control & 7437 if (vmcs12->cpu_based_vm_exec_control &
7435 CPU_BASED_CR8_STORE_EXITING) 7438 CPU_BASED_CR8_STORE_EXITING)
7436 return 1; 7439 return true;
7437 break; 7440 break;
7438 } 7441 }
7439 break; 7442 break;
@@ -7444,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7444 */ 7447 */
7445 if (vmcs12->cr0_guest_host_mask & 0xe & 7448 if (vmcs12->cr0_guest_host_mask & 0xe &
7446 (val ^ vmcs12->cr0_read_shadow)) 7449 (val ^ vmcs12->cr0_read_shadow))
7447 return 1; 7450 return true;
7448 if ((vmcs12->cr0_guest_host_mask & 0x1) && 7451 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
7449 !(vmcs12->cr0_read_shadow & 0x1) && 7452 !(vmcs12->cr0_read_shadow & 0x1) &&
7450 (val & 0x1)) 7453 (val & 0x1))
7451 return 1; 7454 return true;
7452 break; 7455 break;
7453 } 7456 }
7454 return 0; 7457 return false;
7455} 7458}
7456 7459
7457/* 7460/*
@@ -7474,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7474 KVM_ISA_VMX); 7477 KVM_ISA_VMX);
7475 7478
7476 if (vmx->nested.nested_run_pending) 7479 if (vmx->nested.nested_run_pending)
7477 return 0; 7480 return false;
7478 7481
7479 if (unlikely(vmx->fail)) { 7482 if (unlikely(vmx->fail)) {
7480 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 7483 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
7481 vmcs_read32(VM_INSTRUCTION_ERROR)); 7484 vmcs_read32(VM_INSTRUCTION_ERROR));
7482 return 1; 7485 return true;
7483 } 7486 }
7484 7487
7485 switch (exit_reason) { 7488 switch (exit_reason) {
7486 case EXIT_REASON_EXCEPTION_NMI: 7489 case EXIT_REASON_EXCEPTION_NMI:
7487 if (!is_exception(intr_info)) 7490 if (!is_exception(intr_info))
7488 return 0; 7491 return false;
7489 else if (is_page_fault(intr_info)) 7492 else if (is_page_fault(intr_info))
7490 return enable_ept; 7493 return enable_ept;
7491 else if (is_no_device(intr_info) && 7494 else if (is_no_device(intr_info) &&
7492 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7495 !(vmcs12->guest_cr0 & X86_CR0_TS))
7493 return 0; 7496 return false;
7494 return vmcs12->exception_bitmap & 7497 return vmcs12->exception_bitmap &
7495 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7498 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
7496 case EXIT_REASON_EXTERNAL_INTERRUPT: 7499 case EXIT_REASON_EXTERNAL_INTERRUPT:
7497 return 0; 7500 return false;
7498 case EXIT_REASON_TRIPLE_FAULT: 7501 case EXIT_REASON_TRIPLE_FAULT:
7499 return 1; 7502 return true;
7500 case EXIT_REASON_PENDING_INTERRUPT: 7503 case EXIT_REASON_PENDING_INTERRUPT:
7501 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7504 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
7502 case EXIT_REASON_NMI_WINDOW: 7505 case EXIT_REASON_NMI_WINDOW:
7503 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7506 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
7504 case EXIT_REASON_TASK_SWITCH: 7507 case EXIT_REASON_TASK_SWITCH:
7505 return 1; 7508 return true;
7506 case EXIT_REASON_CPUID: 7509 case EXIT_REASON_CPUID:
7507 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7510 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
7508 return 0; 7511 return false;
7509 return 1; 7512 return true;
7510 case EXIT_REASON_HLT: 7513 case EXIT_REASON_HLT:
7511 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7514 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
7512 case EXIT_REASON_INVD: 7515 case EXIT_REASON_INVD:
7513 return 1; 7516 return true;
7514 case EXIT_REASON_INVLPG: 7517 case EXIT_REASON_INVLPG:
7515 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7518 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
7516 case EXIT_REASON_RDPMC: 7519 case EXIT_REASON_RDPMC:
7517 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7520 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
7518 case EXIT_REASON_RDTSC: 7521 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
7519 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7522 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
7520 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7523 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
7521 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7524 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
@@ -7527,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7527 * VMX instructions trap unconditionally. This allows L1 to 7530 * VMX instructions trap unconditionally. This allows L1 to
7528 * emulate them for its L2 guest, i.e., allows 3-level nesting! 7531 * emulate them for its L2 guest, i.e., allows 3-level nesting!
7529 */ 7532 */
7530 return 1; 7533 return true;
7531 case EXIT_REASON_CR_ACCESS: 7534 case EXIT_REASON_CR_ACCESS:
7532 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 7535 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
7533 case EXIT_REASON_DR_ACCESS: 7536 case EXIT_REASON_DR_ACCESS:
@@ -7538,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7538 case EXIT_REASON_MSR_WRITE: 7541 case EXIT_REASON_MSR_WRITE:
7539 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 7542 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
7540 case EXIT_REASON_INVALID_STATE: 7543 case EXIT_REASON_INVALID_STATE:
7541 return 1; 7544 return true;
7542 case EXIT_REASON_MWAIT_INSTRUCTION: 7545 case EXIT_REASON_MWAIT_INSTRUCTION:
7543 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 7546 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
7544 case EXIT_REASON_MONITOR_INSTRUCTION: 7547 case EXIT_REASON_MONITOR_INSTRUCTION:
@@ -7548,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7548 nested_cpu_has2(vmcs12, 7551 nested_cpu_has2(vmcs12,
7549 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 7552 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
7550 case EXIT_REASON_MCE_DURING_VMENTRY: 7553 case EXIT_REASON_MCE_DURING_VMENTRY:
7551 return 0; 7554 return false;
7552 case EXIT_REASON_TPR_BELOW_THRESHOLD: 7555 case EXIT_REASON_TPR_BELOW_THRESHOLD:
7553 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 7556 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
7554 case EXIT_REASON_APIC_ACCESS: 7557 case EXIT_REASON_APIC_ACCESS:
@@ -7557,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7557 case EXIT_REASON_APIC_WRITE: 7560 case EXIT_REASON_APIC_WRITE:
7558 case EXIT_REASON_EOI_INDUCED: 7561 case EXIT_REASON_EOI_INDUCED:
7559 /* apic_write and eoi_induced should exit unconditionally. */ 7562 /* apic_write and eoi_induced should exit unconditionally. */
7560 return 1; 7563 return true;
7561 case EXIT_REASON_EPT_VIOLATION: 7564 case EXIT_REASON_EPT_VIOLATION:
7562 /* 7565 /*
7563 * L0 always deals with the EPT violation. If nested EPT is 7566 * L0 always deals with the EPT violation. If nested EPT is
@@ -7565,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7565 * missing in the guest EPT table (EPT12), the EPT violation 7568 * missing in the guest EPT table (EPT12), the EPT violation
7566 * will be injected with nested_ept_inject_page_fault() 7569 * will be injected with nested_ept_inject_page_fault()
7567 */ 7570 */
7568 return 0; 7571 return false;
7569 case EXIT_REASON_EPT_MISCONFIG: 7572 case EXIT_REASON_EPT_MISCONFIG:
7570 /* 7573 /*
7571 * L2 never uses directly L1's EPT, but rather L0's own EPT 7574 * L2 never uses directly L1's EPT, but rather L0's own EPT
@@ -7573,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7573 * (EPT on EPT). So any problems with the structure of the 7576 * (EPT on EPT). So any problems with the structure of the
7574 * table is L0's fault. 7577 * table is L0's fault.
7575 */ 7578 */
7576 return 0; 7579 return false;
7577 case EXIT_REASON_WBINVD: 7580 case EXIT_REASON_WBINVD:
7578 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 7581 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
7579 case EXIT_REASON_XSETBV: 7582 case EXIT_REASON_XSETBV:
7580 return 1; 7583 return true;
7581 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 7584 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
7582 /* 7585 /*
7583 * This should never happen, since it is not possible to 7586 * This should never happen, since it is not possible to
@@ -7587,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7587 */ 7590 */
7588 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7591 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7589 default: 7592 default:
7590 return 1; 7593 return true;
7591 } 7594 }
7592} 7595}
7593 7596
@@ -8522,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8522 exec_control); 8525 exec_control);
8523 } 8526 }
8524 } 8527 }
8528 if (nested && !vmx->rdtscp_enabled)
8529 vmx->nested.nested_vmx_secondary_ctls_high &=
8530 ~SECONDARY_EXEC_RDTSCP;
8525 } 8531 }
8526 8532
8527 /* Exposing INVPCID only when PCID is exposed */ 8533 /* Exposing INVPCID only when PCID is exposed */
@@ -8622,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8622 struct vmcs12 *vmcs12) 8628 struct vmcs12 *vmcs12)
8623{ 8629{
8624 struct vcpu_vmx *vmx = to_vmx(vcpu); 8630 struct vcpu_vmx *vmx = to_vmx(vcpu);
8631 int maxphyaddr = cpuid_maxphyaddr(vcpu);
8625 8632
8626 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 8633 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
8627 /* TODO: Also verify bits beyond physical address width are 0 */ 8634 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
8628 if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) 8635 vmcs12->apic_access_addr >> maxphyaddr)
8629 return false; 8636 return false;
8630 8637
8631 /* 8638 /*
@@ -8641,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8641 } 8648 }
8642 8649
8643 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 8650 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
8644 /* TODO: Also verify bits beyond physical address width are 0 */ 8651 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
8645 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) 8652 vmcs12->virtual_apic_page_addr >> maxphyaddr)
8646 return false; 8653 return false;
8647 8654
8648 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 8655 if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8665,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8665 } 8672 }
8666 8673
8667 if (nested_cpu_has_posted_intr(vmcs12)) { 8674 if (nested_cpu_has_posted_intr(vmcs12)) {
8668 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) 8675 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
8676 vmcs12->posted_intr_desc_addr >> maxphyaddr)
8669 return false; 8677 return false;
8670 8678
8671 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 8679 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -8864,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8864 8872
8865static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 8873static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8866 unsigned long count_field, 8874 unsigned long count_field,
8867 unsigned long addr_field, 8875 unsigned long addr_field)
8868 int maxphyaddr)
8869{ 8876{
8877 int maxphyaddr;
8870 u64 count, addr; 8878 u64 count, addr;
8871 8879
8872 if (vmcs12_read_any(vcpu, count_field, &count) || 8880 if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8876,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8876 } 8884 }
8877 if (count == 0) 8885 if (count == 0)
8878 return 0; 8886 return 0;
8887 maxphyaddr = cpuid_maxphyaddr(vcpu);
8879 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 8888 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8880 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { 8889 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8881 pr_warn_ratelimited( 8890 pr_warn_ratelimited(
@@ -8889,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8889static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, 8898static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8890 struct vmcs12 *vmcs12) 8899 struct vmcs12 *vmcs12)
8891{ 8900{
8892 int maxphyaddr;
8893
8894 if (vmcs12->vm_exit_msr_load_count == 0 && 8901 if (vmcs12->vm_exit_msr_load_count == 0 &&
8895 vmcs12->vm_exit_msr_store_count == 0 && 8902 vmcs12->vm_exit_msr_store_count == 0 &&
8896 vmcs12->vm_entry_msr_load_count == 0) 8903 vmcs12->vm_entry_msr_load_count == 0)
8897 return 0; /* Fast path */ 8904 return 0; /* Fast path */
8898 maxphyaddr = cpuid_maxphyaddr(vcpu);
8899 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, 8905 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8900 VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || 8906 VM_EXIT_MSR_LOAD_ADDR) ||
8901 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, 8907 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8902 VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || 8908 VM_EXIT_MSR_STORE_ADDR) ||
8903 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 8909 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8904 VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) 8910 VM_ENTRY_MSR_LOAD_ADDR))
8905 return -EINVAL; 8911 return -EINVAL;
8906 return 0; 8912 return 0;
8907} 8913}
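nested_vmx_check_msr_switch() above now looks up maxphyaddr itself and rejects an MSR load/store area unless it is 16-byte aligned and both its first and last byte fall below the guest's physical address width. A standalone version of that predicate; the 16-byte entry size is an assumption about the MSR-entry layout rather than something stated in the hunk:

/* Standalone version of the bounds check done in the hunk above. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MSR_ENTRY_SIZE 16u	/* assumed sizeof(struct vmx_msr_entry) */

static bool msr_area_ok(uint64_t addr, uint64_t count, unsigned maxphyaddr)
{
	uint64_t last;

	if (count == 0)
		return true;		/* nothing to load/store */
	if (addr & (16 - 1))
		return false;		/* must be 16-byte aligned */
	if (addr >> maxphyaddr)
		return false;		/* start beyond physical width */
	last = addr + count * MSR_ENTRY_SIZE - 1;
	return !(last >> maxphyaddr);	/* end must fit as well */
}

int main(void)
{
	/* 36-bit guest physical address width as an example */
	assert(msr_area_ok(0x1000, 4, 36));
	assert(!msr_area_ok(0x1008, 4, 36));		/* misaligned */
	assert(!msr_area_ok(0xfffffff00ull, 64, 36));	/* runs past 2^36 */
	return 0;
}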
@@ -9151,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9151 exec_control &= ~SECONDARY_EXEC_RDTSCP; 9157 exec_control &= ~SECONDARY_EXEC_RDTSCP;
9152 /* Take the following fields only from vmcs12 */ 9158 /* Take the following fields only from vmcs12 */
9153 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9159 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9160 SECONDARY_EXEC_RDTSCP |
9154 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9161 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9155 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9162 SECONDARY_EXEC_APIC_REGISTER_VIRT);
9156 if (nested_cpu_has(vmcs12, 9163 if (nested_cpu_has(vmcs12,
9157 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9164 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9158 exec_control |= vmcs12->secondary_vm_exec_control; 9165 exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9385,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9385 } 9392 }
9386 9393
9387 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9394 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
9388 /*TODO: Also verify bits beyond physical address width are 0*/
9389 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9395 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9390 return 1; 9396 return 1;
9391 } 9397 }
@@ -9524,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9524 vmcs12->launch_state = 1; 9530 vmcs12->launch_state = 1;
9525 9531
9526 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9532 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
9527 return kvm_emulate_halt(vcpu); 9533 return kvm_vcpu_halt(vcpu);
9528 9534
9529 vmx->nested.nested_run_pending = 1; 9535 vmx->nested.nested_run_pending = 1;
9530 9536
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ee725f1896d..e1a81267f3f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
801} 801}
802EXPORT_SYMBOL_GPL(kvm_get_cr8); 802EXPORT_SYMBOL_GPL(kvm_get_cr8);
803 803
804static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
805{
806 int i;
807
808 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
809 for (i = 0; i < KVM_NR_DB_REGS; i++)
810 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
811 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
812 }
813}
814
804static void kvm_update_dr6(struct kvm_vcpu *vcpu) 815static void kvm_update_dr6(struct kvm_vcpu *vcpu)
805{ 816{
806 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 817 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
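kvm_update_dr0123() above refreshes the effective DR0-DR3 copies after userspace rewrites the debug registers (unless hardware breakpoints are in use by userspace) and raises a reload flag that vcpu_enter_guest() clears once it has rewritten the hardware registers, as in the later hunk. A toy model of that handshake; the flag bit value and field names here are illustrative, not the kernel's:

/* Toy model of the reload handshake added by the hunk above. */
#include <assert.h>
#include <stdint.h>

#define NR_DB_REGS	4
#define DEBUGREG_RELOAD	(1u << 2)	/* assumed flag bit, illustrative */

struct toy_vcpu {
	uint64_t db[NR_DB_REGS];	/* values set by userspace */
	uint64_t eff_db[NR_DB_REGS];	/* what will be loaded into DR0-3 */
	unsigned switch_db_regs;
};

static void toy_update_dr0123(struct toy_vcpu *v)
{
	for (int i = 0; i < NR_DB_REGS; i++)
		v->eff_db[i] = v->db[i];
	v->switch_db_regs |= DEBUGREG_RELOAD;
}

static void toy_enter_guest(struct toy_vcpu *v)
{
	if (v->switch_db_regs) {
		/* set_debugreg(eff_db[i], i) would happen here */
		v->switch_db_regs &= ~DEBUGREG_RELOAD;
	}
}

int main(void)
{
	struct toy_vcpu v = { .db = { 0x1000, 0, 0, 0 } };

	toy_update_dr0123(&v);
	assert(v.eff_db[0] == 0x1000 && (v.switch_db_regs & DEBUGREG_RELOAD));

	toy_enter_guest(&v);
	assert(!(v.switch_db_regs & DEBUGREG_RELOAD));
	return 0;
}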
@@ -3149,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3149 return -EINVAL; 3160 return -EINVAL;
3150 3161
3151 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 3162 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3163 kvm_update_dr0123(vcpu);
3152 vcpu->arch.dr6 = dbgregs->dr6; 3164 vcpu->arch.dr6 = dbgregs->dr6;
3153 kvm_update_dr6(vcpu); 3165 kvm_update_dr6(vcpu);
3154 vcpu->arch.dr7 = dbgregs->dr7; 3166 vcpu->arch.dr7 = dbgregs->dr7;
@@ -4114,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
4114 do { 4126 do {
4115 n = min(len, 8); 4127 n = min(len, 8);
4116 if (!(vcpu->arch.apic && 4128 if (!(vcpu->arch.apic &&
4117 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) 4129 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
4118 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 4130 && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
4119 break; 4131 break;
4120 handled += n; 4132 handled += n;
4121 addr += n; 4133 addr += n;
@@ -4134,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
4134 do { 4146 do {
4135 n = min(len, 8); 4147 n = min(len, 8);
4136 if (!(vcpu->arch.apic && 4148 if (!(vcpu->arch.apic &&
4137 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) 4149 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
4138 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 4150 addr, n, v))
4151 && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
4139 break; 4152 break;
4140 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); 4153 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
4141 handled += n; 4154 handled += n;
@@ -4475,7 +4488,8 @@ mmio:
4475 return X86EMUL_CONTINUE; 4488 return X86EMUL_CONTINUE;
4476} 4489}
4477 4490
4478int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 4491static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
4492 unsigned long addr,
4479 void *val, unsigned int bytes, 4493 void *val, unsigned int bytes,
4480 struct x86_exception *exception, 4494 struct x86_exception *exception,
4481 const struct read_write_emulator_ops *ops) 4495 const struct read_write_emulator_ops *ops)
@@ -4538,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4538 exception, &read_emultor); 4552 exception, &read_emultor);
4539} 4553}
4540 4554
4541int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 4555static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4542 unsigned long addr, 4556 unsigned long addr,
4543 const void *val, 4557 const void *val,
4544 unsigned int bytes, 4558 unsigned int bytes,
@@ -4629,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4629 int r; 4643 int r;
4630 4644
4631 if (vcpu->arch.pio.in) 4645 if (vcpu->arch.pio.in)
4632 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 4646 r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
4633 vcpu->arch.pio.size, pd); 4647 vcpu->arch.pio.size, pd);
4634 else 4648 else
4635 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 4649 r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
4636 vcpu->arch.pio.port, vcpu->arch.pio.size, 4650 vcpu->arch.pio.port, vcpu->arch.pio.size,
4637 pd); 4651 pd);
4638 return r; 4652 return r;
@@ -4705,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4705 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); 4719 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4706} 4720}
4707 4721
4708int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4722int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
4709{ 4723{
4710 if (!need_emulate_wbinvd(vcpu)) 4724 if (!need_emulate_wbinvd(vcpu))
4711 return X86EMUL_CONTINUE; 4725 return X86EMUL_CONTINUE;
@@ -4722,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4722 wbinvd(); 4736 wbinvd();
4723 return X86EMUL_CONTINUE; 4737 return X86EMUL_CONTINUE;
4724} 4738}
4739
4740int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4741{
4742 kvm_x86_ops->skip_emulated_instruction(vcpu);
4743 return kvm_emulate_wbinvd_noskip(vcpu);
4744}
4725EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4745EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4726 4746
4747
4748
4727static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) 4749static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4728{ 4750{
4729 kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); 4751 kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
4730} 4752}
4731 4753
4732int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 4754static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
4755 unsigned long *dest)
4733{ 4756{
4734 return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); 4757 return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4735} 4758}
4736 4759
4737int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 4760static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
4761 unsigned long value)
4738{ 4762{
4739 4763
4740 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); 4764 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
@@ -5816,7 +5840,7 @@ void kvm_arch_exit(void)
5816 free_percpu(shared_msrs); 5840 free_percpu(shared_msrs);
5817} 5841}
5818 5842
5819int kvm_emulate_halt(struct kvm_vcpu *vcpu) 5843int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
5820{ 5844{
5821 ++vcpu->stat.halt_exits; 5845 ++vcpu->stat.halt_exits;
5822 if (irqchip_in_kernel(vcpu->kvm)) { 5846 if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5827,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5827 return 0; 5851 return 0;
5828 } 5852 }
5829} 5853}
5854EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
5855
5856int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5857{
5858 kvm_x86_ops->skip_emulated_instruction(vcpu);
5859 return kvm_vcpu_halt(vcpu);
5860}
5830EXPORT_SYMBOL_GPL(kvm_emulate_halt); 5861EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5831 5862
5832int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 5863int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5903,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5903 lapic_irq.dest_id = apicid; 5934 lapic_irq.dest_id = apicid;
5904 5935
5905 lapic_irq.delivery_mode = APIC_DM_REMRD; 5936 lapic_irq.delivery_mode = APIC_DM_REMRD;
5906 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); 5937 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
5907} 5938}
5908 5939
5909int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5940int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -5911,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5911 unsigned long nr, a0, a1, a2, a3, ret; 5942 unsigned long nr, a0, a1, a2, a3, ret;
5912 int op_64_bit, r = 1; 5943 int op_64_bit, r = 1;
5913 5944
5945 kvm_x86_ops->skip_emulated_instruction(vcpu);
5946
5914 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 5947 if (kvm_hv_hypercall_enabled(vcpu->kvm))
5915 return kvm_hv_hypercall(vcpu); 5948 return kvm_hv_hypercall(vcpu);
5916 5949
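
Note on the hunks above: kvm_emulate_wbinvd() and kvm_emulate_halt() are split into a *_noskip/kvm_vcpu_halt core plus a thin wrapper that first advances the guest instruction pointer via kvm_x86_ops->skip_emulated_instruction(), and kvm_emulate_hypercall() gains the same explicit skip. Callers that manage RIP themselves (the instruction emulator) keep using the core function. A minimal standalone C sketch of that "skip, then emulate" split follows; every toy_* name is an illustrative stand-in, not the kernel API.

/*
 * Sketch of the "skip, then emulate" split.  skip_emulated_instruction()
 * advances the guest RIP past the trapping instruction; exit handlers call
 * the wrapper, while the emulator path calls the _noskip core directly.
 */
#include <stdio.h>

struct toy_vcpu {
	unsigned long rip;
	unsigned long insn_len;		/* length of the instruction that trapped */
};

static void toy_skip_emulated_instruction(struct toy_vcpu *vcpu)
{
	vcpu->rip += vcpu->insn_len;	/* models kvm_x86_ops->skip_emulated_instruction() */
}

static int toy_emulate_hlt_noskip(struct toy_vcpu *vcpu)
{
	printf("vcpu halted at rip=%#lx\n", vcpu->rip);
	return 1;			/* stay in the in-kernel run loop */
}

/* Exit-handler flavour: consume the HLT first, then do the work. */
static int toy_emulate_hlt(struct toy_vcpu *vcpu)
{
	toy_skip_emulated_instruction(vcpu);
	return toy_emulate_hlt_noskip(vcpu);
}

int main(void)
{
	struct toy_vcpu vcpu = { .rip = 0x1000, .insn_len = 1 };

	return toy_emulate_hlt(&vcpu) == 1 ? 0 : 1;
}
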
@@ -6164,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
6164} 6197}
6165 6198
6166/* 6199/*
6167 * Returns 1 to let __vcpu_run() continue the guest execution loop without 6200 * Returns 1 to let vcpu_run() continue the guest execution loop without
6168 * exiting to the userspace. Otherwise, the value will be returned to the 6201 * exiting to the userspace. Otherwise, the value will be returned to the
6169 * userspace. 6202 * userspace.
6170 */ 6203 */
@@ -6301,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6301 set_debugreg(vcpu->arch.eff_db[2], 2); 6334 set_debugreg(vcpu->arch.eff_db[2], 2);
6302 set_debugreg(vcpu->arch.eff_db[3], 3); 6335 set_debugreg(vcpu->arch.eff_db[3], 3);
6303 set_debugreg(vcpu->arch.dr6, 6); 6336 set_debugreg(vcpu->arch.dr6, 6);
6337 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
6304 } 6338 }
6305 6339
6306 trace_kvm_entry(vcpu->vcpu_id); 6340 trace_kvm_entry(vcpu->vcpu_id);
@@ -6382,42 +6416,47 @@ out:
6382 return r; 6416 return r;
6383} 6417}
6384 6418
6419static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
6420{
6421 if (!kvm_arch_vcpu_runnable(vcpu)) {
6422 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6423 kvm_vcpu_block(vcpu);
6424 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6425 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
6426 return 1;
6427 }
6428
6429 kvm_apic_accept_events(vcpu);
6430 switch(vcpu->arch.mp_state) {
6431 case KVM_MP_STATE_HALTED:
6432 vcpu->arch.pv.pv_unhalted = false;
6433 vcpu->arch.mp_state =
6434 KVM_MP_STATE_RUNNABLE;
6435 case KVM_MP_STATE_RUNNABLE:
6436 vcpu->arch.apf.halted = false;
6437 break;
6438 case KVM_MP_STATE_INIT_RECEIVED:
6439 break;
6440 default:
6441 return -EINTR;
6442 break;
6443 }
6444 return 1;
6445}
6385 6446
6386static int __vcpu_run(struct kvm_vcpu *vcpu) 6447static int vcpu_run(struct kvm_vcpu *vcpu)
6387{ 6448{
6388 int r; 6449 int r;
6389 struct kvm *kvm = vcpu->kvm; 6450 struct kvm *kvm = vcpu->kvm;
6390 6451
6391 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6452 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6392 6453
6393 r = 1; 6454 for (;;) {
6394 while (r > 0) {
6395 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6455 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6396 !vcpu->arch.apf.halted) 6456 !vcpu->arch.apf.halted)
6397 r = vcpu_enter_guest(vcpu); 6457 r = vcpu_enter_guest(vcpu);
6398 else { 6458 else
6399 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6459 r = vcpu_block(kvm, vcpu);
6400 kvm_vcpu_block(vcpu);
6401 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6402 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
6403 kvm_apic_accept_events(vcpu);
6404 switch(vcpu->arch.mp_state) {
6405 case KVM_MP_STATE_HALTED:
6406 vcpu->arch.pv.pv_unhalted = false;
6407 vcpu->arch.mp_state =
6408 KVM_MP_STATE_RUNNABLE;
6409 case KVM_MP_STATE_RUNNABLE:
6410 vcpu->arch.apf.halted = false;
6411 break;
6412 case KVM_MP_STATE_INIT_RECEIVED:
6413 break;
6414 default:
6415 r = -EINTR;
6416 break;
6417 }
6418 }
6419 }
6420
6421 if (r <= 0) 6460 if (r <= 0)
6422 break; 6461 break;
6423 6462
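
The __vcpu_run() rewrite above pulls the blocked-vcpu handling out into vcpu_block() and turns the while (r > 0) loop into for (;;) with explicit breaks on the interrupt/signal paths. A condensed C sketch of that refactoring shape (a toy state machine, not KVM's real types):

#include <stdio.h>

enum toy_mp_state { TOY_RUNNABLE, TOY_HALTED, TOY_INIT_RECEIVED };

struct toy_vcpu {
	enum toy_mp_state mp_state;
	int budget;		/* stand-in for "work left before exiting to userspace" */
};

static int toy_enter_guest(struct toy_vcpu *v)
{
	return --v->budget > 0 ? 1 : 0;	/* 1 = keep looping, <= 0 = return to userspace */
}

/* The extracted helper: wait while not runnable, then fix up mp_state. */
static int toy_vcpu_block(struct toy_vcpu *v)
{
	switch (v->mp_state) {
	case TOY_HALTED:
		v->mp_state = TOY_RUNNABLE;
		/* fall through, as in the patch */
	case TOY_RUNNABLE:
		return 1;
	case TOY_INIT_RECEIVED:
		return 1;
	default:
		return -1;		/* -EINTR in the real code */
	}
}

static int toy_vcpu_run(struct toy_vcpu *v)
{
	int r;

	for (;;) {
		if (v->mp_state == TOY_RUNNABLE)
			r = toy_enter_guest(v);
		else
			r = toy_vcpu_block(v);
		if (r <= 0)
			break;
	}
	return r;
}

int main(void)
{
	struct toy_vcpu v = { .mp_state = TOY_HALTED, .budget = 3 };

	printf("vcpu_run -> %d\n", toy_vcpu_run(&v));
	return 0;
}
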
@@ -6429,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6429 r = -EINTR; 6468 r = -EINTR;
6430 vcpu->run->exit_reason = KVM_EXIT_INTR; 6469 vcpu->run->exit_reason = KVM_EXIT_INTR;
6431 ++vcpu->stat.request_irq_exits; 6470 ++vcpu->stat.request_irq_exits;
6471 break;
6432 } 6472 }
6433 6473
6434 kvm_check_async_pf_completion(vcpu); 6474 kvm_check_async_pf_completion(vcpu);
@@ -6437,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6437 r = -EINTR; 6477 r = -EINTR;
6438 vcpu->run->exit_reason = KVM_EXIT_INTR; 6478 vcpu->run->exit_reason = KVM_EXIT_INTR;
6439 ++vcpu->stat.signal_exits; 6479 ++vcpu->stat.signal_exits;
6480 break;
6440 } 6481 }
6441 if (need_resched()) { 6482 if (need_resched()) {
6442 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6483 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -6568,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6568 } else 6609 } else
6569 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 6610 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
6570 6611
6571 r = __vcpu_run(vcpu); 6612 r = vcpu_run(vcpu);
6572 6613
6573out: 6614out:
6574 post_kvm_run_save(vcpu); 6615 post_kvm_run_save(vcpu);
@@ -7075,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
7075 kvm_clear_exception_queue(vcpu); 7116 kvm_clear_exception_queue(vcpu);
7076 7117
7077 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 7118 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
7119 kvm_update_dr0123(vcpu);
7078 vcpu->arch.dr6 = DR6_INIT; 7120 vcpu->arch.dr6 = DR6_INIT;
7079 kvm_update_dr6(vcpu); 7121 kvm_update_dr6(vcpu);
7080 vcpu->arch.dr7 = DR7_FIXED_1; 7122 vcpu->arch.dr7 = DR7_FIXED_1;
7081 kvm_update_dr7(vcpu); 7123 kvm_update_dr7(vcpu);
7082 7124
7125 vcpu->arch.cr2 = 0;
7126
7083 kvm_make_request(KVM_REQ_EVENT, vcpu); 7127 kvm_make_request(KVM_REQ_EVENT, vcpu);
7084 vcpu->arch.apf.msr_val = 0; 7128 vcpu->arch.apf.msr_val = 0;
7085 vcpu->arch.st.msr_val = 0; 7129 vcpu->arch.st.msr_val = 0;
@@ -7240,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7240 7284
7241 vcpu->arch.pv.pv_unhalted = false; 7285 vcpu->arch.pv.pv_unhalted = false;
7242 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 7286 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
7243 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 7287 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
7244 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 7288 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7245 else 7289 else
7246 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 7290 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7288,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7288 vcpu->arch.guest_supported_xcr0 = 0; 7332 vcpu->arch.guest_supported_xcr0 = 0;
7289 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 7333 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
7290 7334
7335 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
7336
7291 kvm_async_pf_hash_reset(vcpu); 7337 kvm_async_pf_hash_reset(vcpu);
7292 kvm_pmu_init(vcpu); 7338 kvm_pmu_init(vcpu);
7293 7339
@@ -7428,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7428 7474
7429 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 7475 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7430 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { 7476 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
7431 kvm_kvfree(free->arch.rmap[i]); 7477 kvfree(free->arch.rmap[i]);
7432 free->arch.rmap[i] = NULL; 7478 free->arch.rmap[i] = NULL;
7433 } 7479 }
7434 if (i == 0) 7480 if (i == 0)
@@ -7436,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7436 7482
7437 if (!dont || free->arch.lpage_info[i - 1] != 7483 if (!dont || free->arch.lpage_info[i - 1] !=
7438 dont->arch.lpage_info[i - 1]) { 7484 dont->arch.lpage_info[i - 1]) {
7439 kvm_kvfree(free->arch.lpage_info[i - 1]); 7485 kvfree(free->arch.lpage_info[i - 1]);
7440 free->arch.lpage_info[i - 1] = NULL; 7486 free->arch.lpage_info[i - 1] = NULL;
7441 } 7487 }
7442 } 7488 }
@@ -7490,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7490 7536
7491out_free: 7537out_free:
7492 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 7538 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7493 kvm_kvfree(slot->arch.rmap[i]); 7539 kvfree(slot->arch.rmap[i]);
7494 slot->arch.rmap[i] = NULL; 7540 slot->arch.rmap[i] = NULL;
7495 if (i == 0) 7541 if (i == 0)
7496 continue; 7542 continue;
7497 7543
7498 kvm_kvfree(slot->arch.lpage_info[i - 1]); 7544 kvfree(slot->arch.lpage_info[i - 1]);
7499 slot->arch.lpage_info[i - 1] = NULL; 7545 slot->arch.lpage_info[i - 1] = NULL;
7500 } 7546 }
7501 return -ENOMEM; 7547 return -ENOMEM;
@@ -7618,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7618 new = id_to_memslot(kvm->memslots, mem->slot); 7664 new = id_to_memslot(kvm->memslots, mem->slot);
7619 7665
7620 /* 7666 /*
7667 * Dirty logging tracks sptes in 4k granularity, meaning that large
7668 * sptes have to be split. If live migration is successful, the guest
7669 * in the source machine will be destroyed and large sptes will be
7670 * created in the destination. However, if the guest continues to run
7671 * in the source machine (for example if live migration fails), small
7672 * sptes will remain around and cause bad performance.
7673 *
7674 * Scan sptes if dirty logging has been stopped, dropping those
7675 * which can be collapsed into a single large-page spte. Later
7676 * page faults will create the large-page sptes.
7677 */
7678 if ((change != KVM_MR_DELETE) &&
7679 (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
7680 !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
7681 kvm_mmu_zap_collapsible_sptes(kvm, new);
7682
7683 /*
7621 * Set up write protection and/or dirty logging for the new slot. 7684 * Set up write protection and/or dirty logging for the new slot.
7622 * 7685 *
7623 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have 7686 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
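
The new check added above fires only on the transition "dirty logging was enabled for the old slot and is disabled for the new one", and only when the slot is not being deleted. That predicate on its own, as a small hedged C helper (flag and enum values are placeholders):

#include <stdbool.h>
#include <stdint.h>

#define TOY_MEM_LOG_DIRTY_PAGES (1u << 0)	/* placeholder for KVM_MEM_LOG_DIRTY_PAGES */

enum toy_mr_change { TOY_MR_CREATE, TOY_MR_DELETE, TOY_MR_MOVE, TOY_MR_FLAGS_ONLY };

/*
 * True exactly when dirty logging has just been switched off for a slot that
 * still exists -- the case where the small sptes created for 4k-granularity
 * dirty tracking are worth collapsing back into large-page mappings.
 */
static bool toy_should_zap_collapsible_sptes(enum toy_mr_change change,
					     uint32_t old_flags, uint32_t new_flags)
{
	return change != TOY_MR_DELETE &&
	       (old_flags & TOY_MEM_LOG_DIRTY_PAGES) &&
	       !(new_flags & TOY_MEM_LOG_DIRTY_PAGES);
}

int main(void)
{
	return toy_should_zap_collapsible_sptes(TOY_MR_FLAGS_ONLY,
						TOY_MEM_LOG_DIRTY_PAGES, 0) ? 0 : 1;
}
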
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..717908b16037 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
868 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 868 /* Some systems map "vectors" to interrupts weirdly. Not us! */
869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); 869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
870 if (i != SYSCALL_VECTOR) 870 if (i != SYSCALL_VECTOR)
871 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 871 set_intr_gate(i, irq_entries_start +
872 8 * (i - FIRST_EXTERNAL_VECTOR));
872 } 873 }
873 874
874 /* 875 /*
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
1076{ 1077{
1077 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, 1078 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
1078 THREAD_SIZE / PAGE_SIZE); 1079 THREAD_SIZE / PAGE_SIZE);
1080 tss->x86_tss.sp0 = thread->sp0;
1079} 1081}
1080 1082
1081/* Let's just say, I wouldn't do debugging under a Guest. */ 1083/* Let's just say, I wouldn't do debugging under a Guest. */
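
The lguest change above stops indexing the interrupt[] array and instead points each vector at irq_entries_start plus 8 bytes per external vector, relying on the per-vector IRQ entry stubs being emitted back to back, 8 bytes apart. The address arithmetic, in an illustrative C sketch (all symbols are stand-ins, and the stride/vector values are assumptions for the example only):

#include <stdint.h>
#include <stdio.h>

#define TOY_FIRST_EXTERNAL_VECTOR 0x20	/* placeholder for FIRST_EXTERNAL_VECTOR */
#define TOY_IRQ_STUB_STRIDE 8		/* assumed size of each generated entry stub */

/* Stand-in for the contiguous block of entry stubs at irq_entries_start. */
static const uint8_t toy_irq_entries_start[TOY_IRQ_STUB_STRIDE * 224];

/* Address of the entry stub for a given external interrupt vector. */
static const void *toy_stub_for_vector(unsigned int vector)
{
	return toy_irq_entries_start +
	       TOY_IRQ_STUB_STRIDE * (vector - TOY_FIRST_EXTERNAL_VECTOR);
}

int main(void)
{
	printf("vector 0x21 stub at offset %td\n",
	       (const uint8_t *)toy_stub_for_vector(0x21) - toy_irq_entries_start);
	return 0;
}
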
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
13#include <asm/alternative-asm.h> 13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg
17 pushl_cfi %\reg
18 CFI_REL_OFFSET \reg, 0
19.endm
20
21.macro RESTORE reg
22 popl_cfi %\reg
23 CFI_RESTORE \reg
24.endm
25
26.macro read64 reg 16.macro read64 reg
27 movl %ebx, %eax 17 movl %ebx, %eax
28 movl %ecx, %edx 18 movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
67.macro addsub_return func ins insc 57.macro addsub_return func ins insc
68ENTRY(atomic64_\func\()_return_cx8) 58ENTRY(atomic64_\func\()_return_cx8)
69 CFI_STARTPROC 59 CFI_STARTPROC
70 SAVE ebp 60 pushl_cfi_reg ebp
71 SAVE ebx 61 pushl_cfi_reg ebx
72 SAVE esi 62 pushl_cfi_reg esi
73 SAVE edi 63 pushl_cfi_reg edi
74 64
75 movl %eax, %esi 65 movl %eax, %esi
76 movl %edx, %edi 66 movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
8910: 7910:
90 movl %ebx, %eax 80 movl %ebx, %eax
91 movl %ecx, %edx 81 movl %ecx, %edx
92 RESTORE edi 82 popl_cfi_reg edi
93 RESTORE esi 83 popl_cfi_reg esi
94 RESTORE ebx 84 popl_cfi_reg ebx
95 RESTORE ebp 85 popl_cfi_reg ebp
96 ret 86 ret
97 CFI_ENDPROC 87 CFI_ENDPROC
98ENDPROC(atomic64_\func\()_return_cx8) 88ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
104.macro incdec_return func ins insc 94.macro incdec_return func ins insc
105ENTRY(atomic64_\func\()_return_cx8) 95ENTRY(atomic64_\func\()_return_cx8)
106 CFI_STARTPROC 96 CFI_STARTPROC
107 SAVE ebx 97 pushl_cfi_reg ebx
108 98
109 read64 %esi 99 read64 %esi
1101: 1001:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
11910: 10910:
120 movl %ebx, %eax 110 movl %ebx, %eax
121 movl %ecx, %edx 111 movl %ecx, %edx
122 RESTORE ebx 112 popl_cfi_reg ebx
123 ret 113 ret
124 CFI_ENDPROC 114 CFI_ENDPROC
125ENDPROC(atomic64_\func\()_return_cx8) 115ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
130 120
131ENTRY(atomic64_dec_if_positive_cx8) 121ENTRY(atomic64_dec_if_positive_cx8)
132 CFI_STARTPROC 122 CFI_STARTPROC
133 SAVE ebx 123 pushl_cfi_reg ebx
134 124
135 read64 %esi 125 read64 %esi
1361: 1261:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
1462: 1362:
147 movl %ebx, %eax 137 movl %ebx, %eax
148 movl %ecx, %edx 138 movl %ecx, %edx
149 RESTORE ebx 139 popl_cfi_reg ebx
150 ret 140 ret
151 CFI_ENDPROC 141 CFI_ENDPROC
152ENDPROC(atomic64_dec_if_positive_cx8) 142ENDPROC(atomic64_dec_if_positive_cx8)
153 143
154ENTRY(atomic64_add_unless_cx8) 144ENTRY(atomic64_add_unless_cx8)
155 CFI_STARTPROC 145 CFI_STARTPROC
156 SAVE ebp 146 pushl_cfi_reg ebp
157 SAVE ebx 147 pushl_cfi_reg ebx
158/* these just push these two parameters on the stack */ 148/* these just push these two parameters on the stack */
159 SAVE edi 149 pushl_cfi_reg edi
160 SAVE ecx 150 pushl_cfi_reg ecx
161 151
162 movl %eax, %ebp 152 movl %eax, %ebp
163 movl %edx, %edi 153 movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
1793: 1693:
180 addl $8, %esp 170 addl $8, %esp
181 CFI_ADJUST_CFA_OFFSET -8 171 CFI_ADJUST_CFA_OFFSET -8
182 RESTORE ebx 172 popl_cfi_reg ebx
183 RESTORE ebp 173 popl_cfi_reg ebp
184 ret 174 ret
1854: 1754:
186 cmpl %edx, 4(%esp) 176 cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
192 182
193ENTRY(atomic64_inc_not_zero_cx8) 183ENTRY(atomic64_inc_not_zero_cx8)
194 CFI_STARTPROC 184 CFI_STARTPROC
195 SAVE ebx 185 pushl_cfi_reg ebx
196 186
197 read64 %esi 187 read64 %esi
1981: 1881:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
209 199
210 movl $1, %eax 200 movl $1, %eax
2113: 2013:
212 RESTORE ebx 202 popl_cfi_reg ebx
213 ret 203 ret
214 CFI_ENDPROC 204 CFI_ENDPROC
215ENDPROC(atomic64_inc_not_zero_cx8) 205ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
51 */ 51 */
52ENTRY(csum_partial) 52ENTRY(csum_partial)
53 CFI_STARTPROC 53 CFI_STARTPROC
54 pushl_cfi %esi 54 pushl_cfi_reg esi
55 CFI_REL_OFFSET esi, 0 55 pushl_cfi_reg ebx
56 pushl_cfi %ebx
57 CFI_REL_OFFSET ebx, 0
58 movl 20(%esp),%eax # Function arg: unsigned int sum 56 movl 20(%esp),%eax # Function arg: unsigned int sum
59 movl 16(%esp),%ecx # Function arg: int len 57 movl 16(%esp),%ecx # Function arg: int len
60 movl 12(%esp),%esi # Function arg: unsigned char *buff 58 movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
1276: addl %ecx,%eax 1256: addl %ecx,%eax
128 adcl $0, %eax 126 adcl $0, %eax
1297: 1277:
130 testl $1, 12(%esp) 128 testb $1, 12(%esp)
131 jz 8f 129 jz 8f
132 roll $8, %eax 130 roll $8, %eax
1338: 1318:
134 popl_cfi %ebx 132 popl_cfi_reg ebx
135 CFI_RESTORE ebx 133 popl_cfi_reg esi
136 popl_cfi %esi
137 CFI_RESTORE esi
138 ret 134 ret
139 CFI_ENDPROC 135 CFI_ENDPROC
140ENDPROC(csum_partial) 136ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
145 141
146ENTRY(csum_partial) 142ENTRY(csum_partial)
147 CFI_STARTPROC 143 CFI_STARTPROC
148 pushl_cfi %esi 144 pushl_cfi_reg esi
149 CFI_REL_OFFSET esi, 0 145 pushl_cfi_reg ebx
150 pushl_cfi %ebx
151 CFI_REL_OFFSET ebx, 0
152 movl 20(%esp),%eax # Function arg: unsigned int sum 146 movl 20(%esp),%eax # Function arg: unsigned int sum
153 movl 16(%esp),%ecx # Function arg: int len 147 movl 16(%esp),%ecx # Function arg: int len
154 movl 12(%esp),%esi # Function arg: const unsigned char *buf 148 movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
251 addl %ebx,%eax 245 addl %ebx,%eax
252 adcl $0,%eax 246 adcl $0,%eax
25380: 24780:
254 testl $1, 12(%esp) 248 testb $1, 12(%esp)
255 jz 90f 249 jz 90f
256 roll $8, %eax 250 roll $8, %eax
25790: 25190:
258 popl_cfi %ebx 252 popl_cfi_reg ebx
259 CFI_RESTORE ebx 253 popl_cfi_reg esi
260 popl_cfi %esi
261 CFI_RESTORE esi
262 ret 254 ret
263 CFI_ENDPROC 255 CFI_ENDPROC
264ENDPROC(csum_partial) 256ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
298 CFI_STARTPROC 290 CFI_STARTPROC
299 subl $4,%esp 291 subl $4,%esp
300 CFI_ADJUST_CFA_OFFSET 4 292 CFI_ADJUST_CFA_OFFSET 4
301 pushl_cfi %edi 293 pushl_cfi_reg edi
302 CFI_REL_OFFSET edi, 0 294 pushl_cfi_reg esi
303 pushl_cfi %esi 295 pushl_cfi_reg ebx
304 CFI_REL_OFFSET esi, 0
305 pushl_cfi %ebx
306 CFI_REL_OFFSET ebx, 0
307 movl ARGBASE+16(%esp),%eax # sum 296 movl ARGBASE+16(%esp),%eax # sum
308 movl ARGBASE+12(%esp),%ecx # len 297 movl ARGBASE+12(%esp),%ecx # len
309 movl ARGBASE+4(%esp),%esi # src 298 movl ARGBASE+4(%esp),%esi # src
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) )
412 401
413.previous 402.previous
414 403
415 popl_cfi %ebx 404 popl_cfi_reg ebx
416 CFI_RESTORE ebx 405 popl_cfi_reg esi
417 popl_cfi %esi 406 popl_cfi_reg edi
418 CFI_RESTORE esi
419 popl_cfi %edi
420 CFI_RESTORE edi
421 popl_cfi %ecx # equivalent to addl $4,%esp 407 popl_cfi %ecx # equivalent to addl $4,%esp
422 ret 408 ret
423 CFI_ENDPROC 409 CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
441 427
442ENTRY(csum_partial_copy_generic) 428ENTRY(csum_partial_copy_generic)
443 CFI_STARTPROC 429 CFI_STARTPROC
444 pushl_cfi %ebx 430 pushl_cfi_reg ebx
445 CFI_REL_OFFSET ebx, 0 431 pushl_cfi_reg edi
446 pushl_cfi %edi 432 pushl_cfi_reg esi
447 CFI_REL_OFFSET edi, 0
448 pushl_cfi %esi
449 CFI_REL_OFFSET esi, 0
450 movl ARGBASE+4(%esp),%esi #src 433 movl ARGBASE+4(%esp),%esi #src
451 movl ARGBASE+8(%esp),%edi #dst 434 movl ARGBASE+8(%esp),%edi #dst
452 movl ARGBASE+12(%esp),%ecx #len 435 movl ARGBASE+12(%esp),%ecx #len
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) )
506 jmp 7b 489 jmp 7b
507.previous 490.previous
508 491
509 popl_cfi %esi 492 popl_cfi_reg esi
510 CFI_RESTORE esi 493 popl_cfi_reg edi
511 popl_cfi %edi 494 popl_cfi_reg ebx
512 CFI_RESTORE edi
513 popl_cfi %ebx
514 CFI_RESTORE ebx
515 ret 495 ret
516 CFI_ENDPROC 496 CFI_ENDPROC
517ENDPROC(csum_partial_copy_generic) 497ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/cpufeature.h>
3#include <asm/alternative-asm.h> 4#include <asm/alternative-asm.h>
4 5
5/* 6/*
6 * Zero a page. 7 * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
7 * rdi page 8 * recommended to use this when possible and we do use them by default.
8 */ 9 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
9ENTRY(clear_page_c) 10 * Otherwise, use original.
11 */
12
13/*
14 * Zero a page.
15 * %rdi - page
16 */
17ENTRY(clear_page)
10 CFI_STARTPROC 18 CFI_STARTPROC
19
20 ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
21 "jmp clear_page_c_e", X86_FEATURE_ERMS
22
11 movl $4096/8,%ecx 23 movl $4096/8,%ecx
12 xorl %eax,%eax 24 xorl %eax,%eax
13 rep stosq 25 rep stosq
14 ret 26 ret
15 CFI_ENDPROC 27 CFI_ENDPROC
16ENDPROC(clear_page_c) 28ENDPROC(clear_page)
17 29
18ENTRY(clear_page_c_e) 30ENTRY(clear_page_orig)
19 CFI_STARTPROC 31 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26 32
27ENTRY(clear_page)
28 CFI_STARTPROC
29 xorl %eax,%eax 33 xorl %eax,%eax
30 movl $4096/64,%ecx 34 movl $4096/64,%ecx
31 .p2align 4 35 .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
45 nop 49 nop
46 ret 50 ret
47 CFI_ENDPROC 51 CFI_ENDPROC
48.Lclear_page_end: 52ENDPROC(clear_page_orig)
49ENDPROC(clear_page)
50
51 /*
52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
58 53
59#include <asm/cpufeature.h> 54ENTRY(clear_page_c_e)
60 55 CFI_STARTPROC
61 .section .altinstr_replacement,"ax" 56 movl $4096,%ecx
621: .byte 0xeb /* jmp <disp8> */ 57 xorl %eax,%eax
63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 58 rep stosb
642: .byte 0xeb /* jmp <disp8> */ 59 ret
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ 60 CFI_ENDPROC
663: 61ENDPROC(clear_page_c_e)
67 .previous
68 .section .altinstructions,"a"
69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
70 .Lclear_page_end-clear_page, 2b-1b
71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
72 .Lclear_page_end-clear_page,3b-2b
73 .previous
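
The ALTERNATIVE_2 line above replaces the hand-rolled .altinstructions entries: at boot, the alternatives patcher either leaves the "jmp clear_page_orig" in place, NOPs it out when the CPU has REP_GOOD (so execution falls through to the rep stosq body), or rewrites it into "jmp clear_page_c_e" when the CPU also has ERMS. The same three-way selection can be modelled in plain C as a one-time function-pointer choice; this is a sketch of the idea only (the kernel patches the code in place rather than calling through a pointer), and all toy_* names are made up for the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Feature bits standing in for X86_FEATURE_REP_GOOD / X86_FEATURE_ERMS. */
struct toy_cpu_features {
	bool rep_good;
	bool erms;
};

static void toy_clear_page_orig(void *page)	/* models the unrolled-loop variant */
{
	uint64_t *p = page;

	for (size_t i = 0; i < 4096 / 8; i++)
		p[i] = 0;
}

static void toy_clear_page_rep(void *page)  { memset(page, 0, 4096); }	/* rep stosq variant */
static void toy_clear_page_erms(void *page) { memset(page, 0, 4096); }	/* rep stosb variant */

/* Boot-time selection: the most specific feature wins. */
static void (*toy_pick_clear_page(const struct toy_cpu_features *c))(void *)
{
	if (c->erms)
		return toy_clear_page_erms;
	if (c->rep_good)
		return toy_clear_page_rep;
	return toy_clear_page_orig;
}

int main(void)
{
	struct toy_cpu_features c = { .rep_good = true, .erms = false };
	uint64_t page[4096 / 8];

	toy_pick_clear_page(&c)(page);
	return page[0] == 0 ? 0 : 1;
}

As in the old altinstructions ordering, the ERMS alternative is listed second so that it takes precedence when both feature bits are set.
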
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
5#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
6 7
8/*
9 * Some CPUs run faster using the string copy instructions (sane microcode).
10 * It is also a lot simpler. Use this when possible. But, don't use streaming
11 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
12 * prefetch distance based on SMP/UP.
13 */
7 ALIGN 14 ALIGN
8copy_page_rep: 15ENTRY(copy_page)
9 CFI_STARTPROC 16 CFI_STARTPROC
17 ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
10 movl $4096/8, %ecx 18 movl $4096/8, %ecx
11 rep movsq 19 rep movsq
12 ret 20 ret
13 CFI_ENDPROC 21 CFI_ENDPROC
14ENDPROC(copy_page_rep) 22ENDPROC(copy_page)
15
16/*
17 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
18 * Could vary the prefetch distance based on SMP/UP.
19*/
20 23
21ENTRY(copy_page) 24ENTRY(copy_page_regs)
22 CFI_STARTPROC 25 CFI_STARTPROC
23 subq $2*8, %rsp 26 subq $2*8, %rsp
24 CFI_ADJUST_CFA_OFFSET 2*8 27 CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
90 addq $2*8, %rsp 93 addq $2*8, %rsp
91 CFI_ADJUST_CFA_OFFSET -2*8 94 CFI_ADJUST_CFA_OFFSET -2*8
92 ret 95 ret
93.Lcopy_page_end:
94 CFI_ENDPROC 96 CFI_ENDPROC
95ENDPROC(copy_page) 97ENDPROC(copy_page_regs)
96
97 /* Some CPUs run faster using the string copy instructions.
98 It is also a lot simpler. Use this when possible */
99
100#include <asm/cpufeature.h>
101
102 .section .altinstr_replacement,"ax"
1031: .byte 0xeb /* jmp <disp8> */
104 .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
1052:
106 .previous
107 .section .altinstructions,"a"
108 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
109 .Lcopy_page_end-copy_page, 2b-1b
110 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11
12#define FIX_ALIGNMENT 1
13
14#include <asm/current.h> 11#include <asm/current.h>
15#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 13#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
19#include <asm/asm.h> 16#include <asm/asm.h>
20#include <asm/smap.h> 17#include <asm/smap.h>
21 18
22/*
23 * By placing feature2 after feature1 in altinstructions section, we logically
24 * implement:
25 * If CPU has feature2, jmp to alt2 is used
26 * else if CPU has feature1, jmp to alt1 is used
27 * else jmp to orig is used.
28 */
29 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
300:
31 .byte 0xe9 /* 32bit jump */
32 .long \orig-1f /* by default jump to orig */
331:
34 .section .altinstr_replacement,"ax"
352: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
373: .byte 0xe9 /* near jump with 32bit immediate */
38 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
39 .previous
40
41 .section .altinstructions,"a"
42 altinstruction_entry 0b,2b,\feature1,5,5
43 altinstruction_entry 0b,3b,\feature2,5,5
44 .previous
45 .endm
46
47 .macro ALIGN_DESTINATION 19 .macro ALIGN_DESTINATION
48#ifdef FIX_ALIGNMENT
49 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
50 movl %edi,%ecx 21 movl %edi,%ecx
51 andl $7,%ecx 22 andl $7,%ecx
@@ -67,7 +38,6 @@
67 38
68 _ASM_EXTABLE(100b,103b) 39 _ASM_EXTABLE(100b,103b)
69 _ASM_EXTABLE(101b,103b) 40 _ASM_EXTABLE(101b,103b)
70#endif
71 .endm 41 .endm
72 42
73/* Standard copy_to_user with segment limit checking */ 43/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
79 jc bad_to_user 49 jc bad_to_user
80 cmpq TI_addr_limit(%rax),%rcx 50 cmpq TI_addr_limit(%rax),%rcx
81 ja bad_to_user 51 ja bad_to_user
82 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 52 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
83 copy_user_generic_unrolled,copy_user_generic_string, \ 53 "jmp copy_user_generic_string", \
84 copy_user_enhanced_fast_string 54 X86_FEATURE_REP_GOOD, \
55 "jmp copy_user_enhanced_fast_string", \
56 X86_FEATURE_ERMS
85 CFI_ENDPROC 57 CFI_ENDPROC
86ENDPROC(_copy_to_user) 58ENDPROC(_copy_to_user)
87 59
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
94 jc bad_from_user 66 jc bad_from_user
95 cmpq TI_addr_limit(%rax),%rcx 67 cmpq TI_addr_limit(%rax),%rcx
96 ja bad_from_user 68 ja bad_from_user
97 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 69 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
98 copy_user_generic_unrolled,copy_user_generic_string, \ 70 "jmp copy_user_generic_string", \
99 copy_user_enhanced_fast_string 71 X86_FEATURE_REP_GOOD, \
72 "jmp copy_user_enhanced_fast_string", \
73 X86_FEATURE_ERMS
100 CFI_ENDPROC 74 CFI_ENDPROC
101ENDPROC(_copy_from_user) 75ENDPROC(_copy_from_user)
102 76
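
With FIX_ALIGNMENT gone, ALIGN_DESTINATION is now unconditional: it copies single bytes until the destination pointer is 8-byte aligned before the bulk copy starts. The equivalent logic in C, as an illustration only (fault handling via the exception table is omitted):

#include <stddef.h>
#include <stdint.h>

/* Copy 'len' bytes, aligning the destination to 8 bytes first (sketch). */
static void toy_copy_aligned_dst(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Head: at most 7 single-byte copies until d is 8-byte aligned. */
	while (len && ((uintptr_t)d & 7)) {
		*d++ = *s++;
		len--;
	}

	/* Bulk: 8 bytes at a time (the unrolled/rep movsq part of the asm). */
	while (len >= 8) {
		uint64_t tmp;

		__builtin_memcpy(&tmp, s, 8);	/* unaligned-safe 8-byte load */
		__builtin_memcpy(d, &tmp, 8);
		d += 8;
		s += 8;
		len -= 8;
	}

	/* Tail bytes. */
	while (len--)
		*d++ = *s++;
}
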
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
196 196
197 /* handle last odd byte */ 197 /* handle last odd byte */
198.Lhandle_1: 198.Lhandle_1:
199 testl $1, %r10d 199 testb $1, %r10b
200 jz .Lende 200 jz .Lende
201 xorl %ebx, %ebx 201 xorl %ebx, %ebx
202 source 202 source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
52 */ 52 */
53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) 53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
54{ 54{
55 /*
56 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
57 * even if the input buffer is long enough to hold them.
58 */
59 if (buf_len > MAX_INSN_SIZE)
60 buf_len = MAX_INSN_SIZE;
61
55 memset(insn, 0, sizeof(*insn)); 62 memset(insn, 0, sizeof(*insn));
56 insn->kaddr = kaddr; 63 insn->kaddr = kaddr;
57 insn->end_kaddr = kaddr + buf_len; 64 insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
164 /* VEX.W overrides opnd_size */ 171 /* VEX.W overrides opnd_size */
165 insn->opnd_bytes = 8; 172 insn->opnd_bytes = 8;
166 } else { 173 } else {
174 /*
175 * For VEX2, fake VEX3-like byte#2.
176 * Makes it easier to decode vex.W, vex.vvvv,
177 * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
178 */
179 insn->vex_prefix.bytes[2] = b2 & 0x7f;
167 insn->vex_prefix.nbytes = 2; 180 insn->vex_prefix.nbytes = 2;
168 insn->next_byte += 2; 181 insn->next_byte += 2;
169 } 182 }
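
Two independent fixes in the insn.c hunks above: insn_init() now clamps buf_len to MAX_INSN_SIZE so a long input buffer cannot make the decoder walk past the 15-byte architectural limit, and a two-byte VEX prefix gets a synthesized "byte 2" so later field extraction can treat VEX2 and VEX3 uniformly (masking with 0x7f forces vex.W to 0, which is what a VEX2 encoding implies). A small C sketch of the clamp and of pulling the usual fields out of that byte; treat the field layout as an illustrative paraphrase of the SDM, and the toy_* names as stand-ins.

#include <stdint.h>

#define TOY_MAX_INSN_SIZE 15	/* x86 architectural instruction-length limit */

struct toy_vex_fields {
	uint8_t w, vvvv, l, pp;
};

/* Clamp the decode window to the architectural maximum instruction length. */
static int toy_clamp_buf_len(int buf_len)
{
	return buf_len > TOY_MAX_INSN_SIZE ? TOY_MAX_INSN_SIZE : buf_len;
}

/*
 * Decode the common fields from a VEX3 "byte 2".  For a two-byte VEX prefix
 * the decoder synthesizes this byte as (b2 & 0x7f), i.e. with vex.W == 0,
 * so the same extraction works for both prefix forms.
 */
static struct toy_vex_fields toy_vex_byte2_fields(uint8_t b)
{
	struct toy_vex_fields f = {
		.w    = (uint8_t)((b >> 7) & 0x1),
		.vvvv = (uint8_t)((~(b >> 3)) & 0xf),	/* vvvv is stored inverted */
		.l    = (uint8_t)((b >> 2) & 0x1),
		.pp   = (uint8_t)(b & 0x3),
	};
	return f;
}

int main(void)
{
	struct toy_vex_fields f = toy_vex_byte2_fields(0xc5 & 0x7f);

	return toy_clamp_buf_len(64) == TOY_MAX_INSN_SIZE && f.w == 0 ? 0 : 1;
}
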
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4
5#include <asm/cpufeature.h> 4#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
8 7
9/* 8/*
9 * We build a jump to memcpy_orig by default which gets NOPped out on
10 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
11 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
12 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
13 */
14
15.weak memcpy
16
17/*
10 * memcpy - Copy a memory block. 18 * memcpy - Copy a memory block.
11 * 19 *
12 * Input: 20 * Input:
@@ -17,15 +25,11 @@
17 * Output: 25 * Output:
18 * rax original destination 26 * rax original destination
19 */ 27 */
28ENTRY(__memcpy)
29ENTRY(memcpy)
30 ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
31 "jmp memcpy_erms", X86_FEATURE_ERMS
20 32
21/*
22 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
23 *
24 * This gets patched over the unrolled variant (below) via the
25 * alternative instructions framework:
26 */
27 .section .altinstr_replacement, "ax", @progbits
28.Lmemcpy_c:
29 movq %rdi, %rax 33 movq %rdi, %rax
30 movq %rdx, %rcx 34 movq %rdx, %rcx
31 shrq $3, %rcx 35 shrq $3, %rcx
@@ -34,29 +38,21 @@
34 movl %edx, %ecx 38 movl %edx, %ecx
35 rep movsb 39 rep movsb
36 ret 40 ret
37.Lmemcpy_e: 41ENDPROC(memcpy)
38 .previous 42ENDPROC(__memcpy)
39 43
40/* 44/*
41 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than 45 * memcpy_erms() - enhanced fast string memcpy. This is faster and
42 * memcpy_c. Use memcpy_c_e when possible. 46 * simpler than memcpy. Use memcpy_erms when possible.
43 *
44 * This gets patched over the unrolled variant (below) via the
45 * alternative instructions framework:
46 */ 47 */
47 .section .altinstr_replacement, "ax", @progbits 48ENTRY(memcpy_erms)
48.Lmemcpy_c_e:
49 movq %rdi, %rax 49 movq %rdi, %rax
50 movq %rdx, %rcx 50 movq %rdx, %rcx
51 rep movsb 51 rep movsb
52 ret 52 ret
53.Lmemcpy_e_e: 53ENDPROC(memcpy_erms)
54 .previous
55
56.weak memcpy
57 54
58ENTRY(__memcpy) 55ENTRY(memcpy_orig)
59ENTRY(memcpy)
60 CFI_STARTPROC 56 CFI_STARTPROC
61 movq %rdi, %rax 57 movq %rdi, %rax
62 58
@@ -183,26 +179,4 @@ ENTRY(memcpy)
183.Lend: 179.Lend:
184 retq 180 retq
185 CFI_ENDPROC 181 CFI_ENDPROC
186ENDPROC(memcpy) 182ENDPROC(memcpy_orig)
187ENDPROC(__memcpy)
188
189 /*
190 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
191 * If the feature is supported, memcpy_c_e() is the first choice.
192 * If enhanced rep movsb copy is not available, use fast string copy
193 * memcpy_c() when possible. This is faster and code is simpler than
194 * original memcpy().
195 * Otherwise, original memcpy() is used.
196 * In .altinstructions section, ERMS feature is placed after REG_GOOD
197 * feature to implement the right patch order.
198 *
199 * Replace only beginning, memcpy is used to apply alternatives,
200 * so it is silly to overwrite itself with nops - reboot is the
201 * only outcome...
202 */
203 .section .altinstructions, "a"
204 altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
205 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
206 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
207 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
208 .previous
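
The fast-string body that memcpy()/__memcpy() now fall through to (once the ALTERNATIVE_2 jump is NOPped out on REP_GOOD CPUs) is simply "copy len/8 quadwords with rep movsq, then the remaining len%8 bytes with rep movsb". The same split spelled out in C, as a sketch only:

#include <stddef.h>
#include <string.h>

/* C rendering of the rep movsq + rep movsb fast-string memcpy variant. */
static void *toy_memcpy_fast_string(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t qwords = len >> 3;	/* shrq $3, %rcx */
	size_t tail   = len & 7;	/* andl $7, %edx */

	while (qwords--) {		/* rep movsq */
		memcpy(d, s, 8);
		d += 8;
		s += 8;
	}
	while (tail--)			/* rep movsb */
		*d++ = *s++;

	return dst;			/* %rax carries the original destination */
}
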
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
5 * This assembly file is re-written from memmove_64.c file. 5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> 6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */ 7 */
8#define _STRING_C
9#include <linux/linkage.h> 8#include <linux/linkage.h>
10#include <asm/dwarf2.h> 9#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 10#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
44 jg 2f 43 jg 2f
45 44
46.Lmemmove_begin_forward: 45.Lmemmove_begin_forward:
46 ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
47
47 /* 48 /*
48 * movsq instruction have many startup latency 49 * movsq instruction have many startup latency
49 * so we handle small size by general register. 50 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
20713: 20813:
208 retq 209 retq
209 CFI_ENDPROC 210 CFI_ENDPROC
210
211 .section .altinstr_replacement,"ax"
212.Lmemmove_begin_forward_efs:
213 /* Forward moving data. */
214 movq %rdx, %rcx
215 rep movsb
216 retq
217.Lmemmove_end_forward_efs:
218 .previous
219
220 .section .altinstructions,"a"
221 altinstruction_entry .Lmemmove_begin_forward, \
222 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
223 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
224 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
225 .previous
226ENDPROC(__memmove) 211ENDPROC(__memmove)
227ENDPROC(memmove) 212ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
7 7
8.weak memset
9
8/* 10/*
9 * ISO C memset - set a memory block to a byte value. This function uses fast 11 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is 12 * string to get better performance than the original function. The code is
11 * simpler and shorter than the orignal function as well. 13 * simpler and shorter than the orignal function as well.
12 * 14 *
13 * rdi destination 15 * rdi destination
14 * rsi value (char) 16 * rsi value (char)
15 * rdx count (bytes) 17 * rdx count (bytes)
16 * 18 *
17 * rax original destination 19 * rax original destination
18 */ 20 */
19 .section .altinstr_replacement, "ax", @progbits 21ENTRY(memset)
20.Lmemset_c: 22ENTRY(__memset)
23 /*
24 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
25 * to use it when possible. If not available, use fast string instructions.
26 *
27 * Otherwise, use original memset function.
28 */
29 ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
30 "jmp memset_erms", X86_FEATURE_ERMS
31
21 movq %rdi,%r9 32 movq %rdi,%r9
22 movq %rdx,%rcx 33 movq %rdx,%rcx
23 andl $7,%edx 34 andl $7,%edx
@@ -31,8 +42,8 @@
31 rep stosb 42 rep stosb
32 movq %r9,%rax 43 movq %r9,%rax
33 ret 44 ret
34.Lmemset_e: 45ENDPROC(memset)
35 .previous 46ENDPROC(__memset)
36 47
37/* 48/*
38 * ISO C memset - set a memory block to a byte value. This function uses 49 * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
45 * 56 *
46 * rax original destination 57 * rax original destination
47 */ 58 */
48 .section .altinstr_replacement, "ax", @progbits 59ENTRY(memset_erms)
49.Lmemset_c_e:
50 movq %rdi,%r9 60 movq %rdi,%r9
51 movb %sil,%al 61 movb %sil,%al
52 movq %rdx,%rcx 62 movq %rdx,%rcx
53 rep stosb 63 rep stosb
54 movq %r9,%rax 64 movq %r9,%rax
55 ret 65 ret
56.Lmemset_e_e: 66ENDPROC(memset_erms)
57 .previous
58
59.weak memset
60 67
61ENTRY(memset) 68ENTRY(memset_orig)
62ENTRY(__memset)
63 CFI_STARTPROC 69 CFI_STARTPROC
64 movq %rdi,%r10 70 movq %rdi,%r10
65 71
@@ -134,23 +140,4 @@ ENTRY(__memset)
134 jmp .Lafter_bad_alignment 140 jmp .Lafter_bad_alignment
135.Lfinal: 141.Lfinal:
136 CFI_ENDPROC 142 CFI_ENDPROC
137ENDPROC(memset) 143ENDPROC(memset_orig)
138ENDPROC(__memset)
139
140 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
141 * It is recommended to use this when possible.
142 *
143 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
144 * instructions.
145 *
146 * Otherwise, use original memset function.
147 *
148 * In .altinstructions section, ERMS feature is placed after REG_GOOD
149 * feature to implement the right patch order.
150 */
151 .section .altinstructions,"a"
152 altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
153 .Lfinal-__memset,.Lmemset_e-.Lmemset_c
154 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
155 .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
156 .previous
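
The REP_GOOD memset variant (now the shared body of memset/__memset above) follows the same shape: replicate the fill byte across a quadword, store len/8 quadwords with rep stosq, then the len%8 tail with rep stosb. The byte replication itself is not visible in the hunk shown, so the sketch below writes it out in C with a plain multiply; it is a model of the technique, not a copy of the asm.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* C rendering of the rep stosq + rep stosb fast-string memset variant. */
static void *toy_memset_fast_string(void *dst, int c, size_t len)
{
	unsigned char *d = dst;
	uint64_t pattern = 0x0101010101010101ULL * (unsigned char)c;
	size_t qwords = len >> 3;
	size_t tail   = len & 7;

	while (qwords--) {		/* rep stosq */
		memcpy(d, &pattern, 8);
		d += 8;
	}
	while (tail--)			/* rep stosb */
		*d++ = (unsigned char)c;

	return dst;			/* original destination is returned */
}
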
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi_reg rbx
18 pushq_cfi %rbp 18 pushq_cfi_reg rbp
19 movq %rdi, %r10 /* Save pointer */ 19 movq %rdi, %r10 /* Save pointer */
20 xorl %r11d, %r11d /* Return value */ 20 xorl %r11d, %r11d /* Return value */
21 movl (%rdi), %eax 21 movl (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
35 movl %ebp, 20(%r10) 35 movl %ebp, 20(%r10)
36 movl %esi, 24(%r10) 36 movl %esi, 24(%r10)
37 movl %edi, 28(%r10) 37 movl %edi, 28(%r10)
38 popq_cfi %rbp 38 popq_cfi_reg rbp
39 popq_cfi %rbx 39 popq_cfi_reg rbx
40 ret 40 ret
413: 413:
42 CFI_RESTORE_STATE 42 CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi_reg ebx
57 pushl_cfi %ebp 57 pushl_cfi_reg ebp
58 pushl_cfi %esi 58 pushl_cfi_reg esi
59 pushl_cfi %edi 59 pushl_cfi_reg edi
60 pushl_cfi $0 /* Return value */ 60 pushl_cfi $0 /* Return value */
61 pushl_cfi %eax 61 pushl_cfi %eax
62 movl 4(%eax), %ecx 62 movl 4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
80 movl %esi, 24(%eax) 80 movl %esi, 24(%eax)
81 movl %edi, 28(%eax) 81 movl %edi, 28(%eax)
82 popl_cfi %eax 82 popl_cfi %eax
83 popl_cfi %edi 83 popl_cfi_reg edi
84 popl_cfi %esi 84 popl_cfi_reg esi
85 popl_cfi %ebp 85 popl_cfi_reg ebp
86 popl_cfi %ebx 86 popl_cfi_reg ebx
87 ret 87 ret
883: 883:
89 CFI_RESTORE_STATE 89 CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
34 */ 34 */
35 35
36#define save_common_regs \ 36#define save_common_regs \
37 pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 37 pushl_cfi_reg ecx
38 38
39#define restore_common_regs \ 39#define restore_common_regs \
40 popl_cfi %ecx; CFI_RESTORE ecx 40 popl_cfi_reg ecx
41 41
42 /* Avoid uglifying the argument copying x86-64 needs to do. */ 42 /* Avoid uglifying the argument copying x86-64 needs to do. */
43 .macro movq src, dst 43 .macro movq src, dst
@@ -64,22 +64,22 @@
64 */ 64 */
65 65
66#define save_common_regs \ 66#define save_common_regs \
67 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ 67 pushq_cfi_reg rdi; \
68 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ 68 pushq_cfi_reg rsi; \
69 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ 69 pushq_cfi_reg rcx; \
70 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ 70 pushq_cfi_reg r8; \
71 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ 71 pushq_cfi_reg r9; \
72 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ 72 pushq_cfi_reg r10; \
73 pushq_cfi %r11; CFI_REL_OFFSET r11, 0 73 pushq_cfi_reg r11
74 74
75#define restore_common_regs \ 75#define restore_common_regs \
76 popq_cfi %r11; CFI_RESTORE r11; \ 76 popq_cfi_reg r11; \
77 popq_cfi %r10; CFI_RESTORE r10; \ 77 popq_cfi_reg r10; \
78 popq_cfi %r9; CFI_RESTORE r9; \ 78 popq_cfi_reg r9; \
79 popq_cfi %r8; CFI_RESTORE r8; \ 79 popq_cfi_reg r8; \
80 popq_cfi %rcx; CFI_RESTORE rcx; \ 80 popq_cfi_reg rcx; \
81 popq_cfi %rsi; CFI_RESTORE rsi; \ 81 popq_cfi_reg rsi; \
82 popq_cfi %rdi; CFI_RESTORE rdi 82 popq_cfi_reg rdi
83 83
84#endif 84#endif
85 85
@@ -87,12 +87,10 @@
87ENTRY(call_rwsem_down_read_failed) 87ENTRY(call_rwsem_down_read_failed)
88 CFI_STARTPROC 88 CFI_STARTPROC
89 save_common_regs 89 save_common_regs
90 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 90 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
91 CFI_REL_OFFSET __ASM_REG(dx), 0
92 movq %rax,%rdi 91 movq %rax,%rdi
93 call rwsem_down_read_failed 92 call rwsem_down_read_failed
94 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 93 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
95 CFI_RESTORE __ASM_REG(dx)
96 restore_common_regs 94 restore_common_regs
97 ret 95 ret
98 CFI_ENDPROC 96 CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
124ENTRY(call_rwsem_downgrade_wake) 122ENTRY(call_rwsem_downgrade_wake)
125 CFI_STARTPROC 123 CFI_STARTPROC
126 save_common_regs 124 save_common_regs
127 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 125 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
128 CFI_REL_OFFSET __ASM_REG(dx), 0
129 movq %rax,%rdi 126 movq %rax,%rdi
130 call rwsem_downgrade_wake 127 call rwsem_downgrade_wake
131 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 128 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
132 CFI_RESTORE __ASM_REG(dx)
133 restore_common_regs 129 restore_common_regs
134 ret 130 ret
135 CFI_ENDPROC 131 CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
13 .globl \name 13 .globl \name
14\name: 14\name:
15 CFI_STARTPROC 15 CFI_STARTPROC
16 pushl_cfi %eax 16 pushl_cfi_reg eax
17 CFI_REL_OFFSET eax, 0 17 pushl_cfi_reg ecx
18 pushl_cfi %ecx 18 pushl_cfi_reg edx
19 CFI_REL_OFFSET ecx, 0
20 pushl_cfi %edx
21 CFI_REL_OFFSET edx, 0
22 19
23 .if \put_ret_addr_in_eax 20 .if \put_ret_addr_in_eax
24 /* Place EIP in the arg1 */ 21 /* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
26 .endif 23 .endif
27 24
28 call \func 25 call \func
29 popl_cfi %edx 26 popl_cfi_reg edx
30 CFI_RESTORE edx 27 popl_cfi_reg ecx
31 popl_cfi %ecx 28 popl_cfi_reg eax
32 CFI_RESTORE ecx
33 popl_cfi %eax
34 CFI_RESTORE eax
35 ret 29 ret
36 CFI_ENDPROC 30 CFI_ENDPROC
37 _ASM_NOKPROBE(\name) 31 _ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
17 CFI_STARTPROC 17 CFI_STARTPROC
18 18
19 /* this one pushes 9 elems, the next one would be %rIP */ 19 /* this one pushes 9 elems, the next one would be %rIP */
20 SAVE_ARGS 20 pushq_cfi_reg rdi
21 pushq_cfi_reg rsi
22 pushq_cfi_reg rdx
23 pushq_cfi_reg rcx
24 pushq_cfi_reg rax
25 pushq_cfi_reg r8
26 pushq_cfi_reg r9
27 pushq_cfi_reg r10
28 pushq_cfi_reg r11
21 29
22 .if \put_ret_addr_in_rdi 30 .if \put_ret_addr_in_rdi
31 /* 9*8(%rsp) is return addr on stack */
23 movq_cfi_restore 9*8, rdi 32 movq_cfi_restore 9*8, rdi
24 .endif 33 .endif
25 34
@@ -45,11 +54,22 @@
45#endif 54#endif
46#endif 55#endif
47 56
48 /* SAVE_ARGS below is used only for the .cfi directives it contains. */ 57#if defined(CONFIG_TRACE_IRQFLAGS) \
58 || defined(CONFIG_DEBUG_LOCK_ALLOC) \
59 || defined(CONFIG_PREEMPT)
49 CFI_STARTPROC 60 CFI_STARTPROC
50 SAVE_ARGS 61 CFI_ADJUST_CFA_OFFSET 9*8
51restore: 62restore:
52 RESTORE_ARGS 63 popq_cfi_reg r11
64 popq_cfi_reg r10
65 popq_cfi_reg r9
66 popq_cfi_reg r8
67 popq_cfi_reg rax
68 popq_cfi_reg rcx
69 popq_cfi_reg rdx
70 popq_cfi_reg rsi
71 popq_cfi_reg rdi
53 ret 72 ret
54 CFI_ENDPROC 73 CFI_ENDPROC
55 _ASM_NOKPROBE(restore) 74 _ASM_NOKPROBE(restore)
75#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index c905e89e19fe..1f33b3d1fd68 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -69,21 +69,20 @@ EXPORT_SYMBOL(copy_in_user);
69 * it is not necessary to optimize tail handling. 69 * it is not necessary to optimize tail handling.
70 */ 70 */
71__visible unsigned long 71__visible unsigned long
72copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) 72copy_user_handle_tail(char *to, char *from, unsigned len)
73{ 73{
74 char c;
75 unsigned zero_len;
76
77 for (; len; --len, to++) { 74 for (; len; --len, to++) {
75 char c;
76
78 if (__get_user_nocheck(c, from++, sizeof(char))) 77 if (__get_user_nocheck(c, from++, sizeof(char)))
79 break; 78 break;
80 if (__put_user_nocheck(c, to, sizeof(char))) 79 if (__put_user_nocheck(c, to, sizeof(char)))
81 break; 80 break;
82 } 81 }
83
84 for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
85 if (__put_user_nocheck(c, to++, sizeof(char)))
86 break;
87 clac(); 82 clac();
83
84 /* If the destination is a kernel buffer, we always clear the end */
85 if ((unsigned long)to >= TASK_SIZE_MAX)
86 memset(to, 0, len);
88 return len; 87 return len;
89} 88}
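
copy_user_handle_tail() above loses its zerorest argument: the tail is still copied byte by byte until a fault, but the uncopied remainder is now cleared only when the destination is a kernel buffer (address at or above TASK_SIZE_MAX), since only reads into kernel buffers need the end zeroed. A rough userspace model of that control flow; toy_get_byte/toy_put_byte stand in for the __get_user/__put_user pairs and the TASK_SIZE_MAX value is a placeholder.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define TOY_TASK_SIZE_MAX ((uintptr_t)1 << 47)	/* placeholder user/kernel boundary */

/* Stand-ins for __get_user_nocheck/__put_user_nocheck; nonzero means "faulted". */
static int toy_get_byte(char *out, const char *from) { *out = *from; return 0; }
static int toy_put_byte(char c, char *to)            { *to = c;     return 0; }

static unsigned long toy_copy_tail(char *to, char *from, unsigned long len)
{
	for (; len; --len, to++) {
		char c;

		if (toy_get_byte(&c, from++))
			break;
		if (toy_put_byte(c, to))
			break;
	}

	/* Kernel destination: make sure the uncopied end is zeroed. */
	if ((uintptr_t)to >= TOY_TASK_SIZE_MAX)
		memset(to, 0, len);

	return len;	/* bytes that could not be copied */
}
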
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
273de: ESC 273de: ESC
274df: ESC 274df: ESC
275# 0xe0 - 0xef 275# 0xe0 - 0xef
276# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
277# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
278# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
276e0: LOOPNE/LOOPNZ Jb (f64) 279e0: LOOPNE/LOOPNZ Jb (f64)
277e1: LOOPE/LOOPZ Jb (f64) 280e1: LOOPE/LOOPZ Jb (f64)
278e2: LOOP Jb (f64) 281e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
281e5: IN eAX,Ib 284e5: IN eAX,Ib
282e6: OUT Ib,AL 285e6: OUT Ib,AL
283e7: OUT Ib,eAX 286e7: OUT Ib,eAX
287# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
288# in "near" jumps and calls is 16-bit. For CALL,
289# push of return address is 16-bit wide, RSP is decremented by 2
290# but is not truncated to 16 bits, unlike RIP.
284e8: CALL Jz (f64) 291e8: CALL Jz (f64)
285e9: JMP-near Jz (f64) 292e9: JMP-near Jz (f64)
286ea: JMP-far Ap (i64) 293ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
4567e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 4637e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
4577f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) 4647f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
458# 0x0f 0x80-0x8f 465# 0x0f 0x80-0x8f
466# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
45980: JO Jz (f64) 46780: JO Jz (f64)
46081: JNO Jz (f64) 46881: JNO Jz (f64)
46182: JB/JC/JNAE Jz (f64) 46982: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
842GrpTable: Grp5 850GrpTable: Grp5
8430: INC Ev 8510: INC Ev
8441: DEC Ev 8521: DEC Ev
853# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
8452: CALLN Ev (f64) 8542: CALLN Ev (f64)
8463: CALLF Ep 8553: CALLF Ep
8474: JMPN Ev (f64) 8564: JMPN Ev (f64)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
32obj-$(CONFIG_ACPI_NUMA) += srat.o 32obj-$(CONFIG_ACPI_NUMA) += srat.o
33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
34 34
35obj-$(CONFIG_MEMTEST) += memtest.o
36
37obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 35obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..181c53bac3a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
59 int ret = 0; 59 int ret = 0;
60 60
61 /* kprobe_running() needs smp_processor_id() */ 61 /* kprobe_running() needs smp_processor_id() */
62 if (kprobes_built_in() && !user_mode_vm(regs)) { 62 if (kprobes_built_in() && !user_mode(regs)) {
63 preempt_disable(); 63 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 64 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 65 ret = 1;
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
148 instr = (void *)convert_ip_to_linear(current, regs); 148 instr = (void *)convert_ip_to_linear(current, regs);
149 max_instr = instr + 15; 149 max_instr = instr + 15;
150 150
151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
152 return 0; 152 return 0;
153 153
154 while (instr < max_instr) { 154 while (instr < max_instr) {
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1035 if (error_code & PF_USER) 1035 if (error_code & PF_USER)
1036 return false; 1036 return false;
1037 1037
1038 if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) 1038 if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1039 return false; 1039 return false;
1040 1040
1041 return true; 1041 return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1140 * User-mode registers count as a user access even for any 1140 * User-mode registers count as a user access even for any
1141 * potential system fault or CPU buglet: 1141 * potential system fault or CPU buglet:
1142 */ 1142 */
1143 if (user_mode_vm(regs)) { 1143 if (user_mode(regs)) {
1144 local_irq_enable(); 1144 local_irq_enable();
1145 error_code |= PF_USER; 1145 error_code |= PF_USER;
1146 flags |= FAULT_FLAG_USER; 1146 flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..1d553186c434 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,29 +29,33 @@
29 29
30/* 30/*
31 * Tables translating between page_cache_type_t and pte encoding. 31 * Tables translating between page_cache_type_t and pte encoding.
32 * Minimal supported modes are defined statically, modified if more supported 32 *
33 * cache modes are available. 33 * Minimal supported modes are defined statically, they are modified
34 * Index into __cachemode2pte_tbl is the cachemode. 34 * during bootup if more supported cache modes are available.
35 * Index into __pte2cachemode_tbl are the caching attribute bits of the pte 35 *
36 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 36 * Index into __cachemode2pte_tbl[] is the cachemode.
37 *
38 * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
39 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
37 */ 40 */
38uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { 41uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
39 [_PAGE_CACHE_MODE_WB] = 0, 42 [_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
40 [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, 43 [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 ,
41 [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, 44 [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
42 [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, 45 [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
43 [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, 46 [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
44 [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, 47 [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
45}; 48};
46EXPORT_SYMBOL(__cachemode2pte_tbl); 49EXPORT_SYMBOL(__cachemode2pte_tbl);
50
47uint8_t __pte2cachemode_tbl[8] = { 51uint8_t __pte2cachemode_tbl[8] = {
48 [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, 52 [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
49 [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, 53 [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC,
50 [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, 54 [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
51 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, 55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
52 [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, 56 [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
53 [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, 57 [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
54 [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, 58 [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
55 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, 59 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
56}; 60};
57EXPORT_SYMBOL(__pte2cachemode_tbl); 61EXPORT_SYMBOL(__pte2cachemode_tbl);
@@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void)
131 135
132int after_bootmem; 136int after_bootmem;
133 137
134int direct_gbpages 138early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
135#ifdef CONFIG_DIRECT_GBPAGES
136 = 1
137#endif
138;
139
140static void __init init_gbpages(void)
141{
142#ifdef CONFIG_X86_64
143 if (direct_gbpages && cpu_has_gbpages)
144 printk(KERN_INFO "Using GB pages for direct mapping\n");
145 else
146 direct_gbpages = 0;
147#endif
148}
149 139
150struct map_range { 140struct map_range {
151 unsigned long start; 141 unsigned long start;
@@ -157,16 +147,12 @@ static int page_size_mask;
157 147
158static void __init probe_page_size_mask(void) 148static void __init probe_page_size_mask(void)
159{ 149{
160 init_gbpages();
161
162#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) 150#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
163 /* 151 /*
164 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 152 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
165 * This will simplify cpa(), which otherwise needs to support splitting 153 * This will simplify cpa(), which otherwise needs to support splitting
166 * large pages into small in interrupt context, etc. 154 * large pages into small in interrupt context, etc.
167 */ 155 */
168 if (direct_gbpages)
169 page_size_mask |= 1 << PG_LEVEL_1G;
170 if (cpu_has_pse) 156 if (cpu_has_pse)
171 page_size_mask |= 1 << PG_LEVEL_2M; 157 page_size_mask |= 1 << PG_LEVEL_2M;
172#endif 158#endif
@@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void)
179 if (cpu_has_pge) { 165 if (cpu_has_pge) {
180 cr4_set_bits_and_update_boot(X86_CR4_PGE); 166 cr4_set_bits_and_update_boot(X86_CR4_PGE);
181 __supported_pte_mask |= _PAGE_GLOBAL; 167 __supported_pte_mask |= _PAGE_GLOBAL;
168 } else
169 __supported_pte_mask &= ~_PAGE_GLOBAL;
170
171 /* Enable 1 GB linear kernel mappings if available: */
172 if (direct_gbpages && cpu_has_gbpages) {
173 printk(KERN_INFO "Using GB pages for direct mapping\n");
174 page_size_mask |= 1 << PG_LEVEL_1G;
175 } else {
176 direct_gbpages = 0;
182 } 177 }
183} 178}
184 179
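The init.c hunks above fold init_gbpages() and the old gbpages/nogbpages handlers into probe_page_size_mask(), which now derives page_size_mask from the CPU features in one place (the DEBUG_PAGEALLOC restriction keeps applying to the 2M case only, as shown). A standalone sketch of that decision, assuming the usual x86 page-level numbering (PG_LEVEL_2M = 2, PG_LEVEL_1G = 3) and with plain booleans standing in for the cpu_has_* tests.

#include <stdio.h>

enum { PG_LEVEL_4K = 1, PG_LEVEL_2M = 2, PG_LEVEL_1G = 3 };	/* assumed numbering */

/* Mirrors the consolidated probe_page_size_mask() decision shown above. */
static unsigned int probe_page_size_mask(int has_pse, int has_gbpages,
					 int want_gbpages, int debug_pagealloc)
{
	unsigned int mask = 0;

	if (!debug_pagealloc && has_pse)
		mask |= 1u << PG_LEVEL_2M;	/* identity map with 2M pages */
	if (want_gbpages && has_gbpages)
		mask |= 1u << PG_LEVEL_1G;	/* "Using GB pages for direct mapping" */
	return mask;
}

int main(void)
{
	printf("mask=%#x\n", probe_page_size_mask(1, 1, 1, 0));	/* 0xc: both 2M and 1G */
	return 0;
}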
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30eb05ae7061..3fba623e3ba5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
130 return 0; 130 return 0;
131} 131}
132 132
133static int __init parse_direct_gbpages_off(char *arg)
134{
135 direct_gbpages = 0;
136 return 0;
137}
138early_param("nogbpages", parse_direct_gbpages_off);
139
140static int __init parse_direct_gbpages_on(char *arg)
141{
142 direct_gbpages = 1;
143 return 0;
144}
145early_param("gbpages", parse_direct_gbpages_on);
146
147/* 133/*
148 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the 134 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
149 * physical space so we can cache the place of the first one and move 135 * physical space so we can cache the place of the first one and move
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
67 67
68/* 68/*
69 * Remap an arbitrary physical address space into the kernel virtual 69 * Remap an arbitrary physical address space into the kernel virtual
70 * address space. Needed when the kernel wants to access high addresses 70 * address space. It transparently creates kernel huge I/O mapping when
71 * directly. 71 * the physical address is aligned by a huge page size (1GB or 2MB) and
72 * the requested size is at least the huge page size.
73 *
74 * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
75 * Therefore, the mapping code falls back to use a smaller page toward 4KB
76 * when a mapping range is covered by non-WB type of MTRRs.
72 * 77 *
73 * NOTE! We need to allow non-page-aligned mappings too: we will obviously 78 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
74 * have to convert them into an offset in a page-aligned mapping, but the 79 * have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
326} 331}
327EXPORT_SYMBOL(iounmap); 332EXPORT_SYMBOL(iounmap);
328 333
334int arch_ioremap_pud_supported(void)
335{
336#ifdef CONFIG_X86_64
337 return cpu_has_gbpages;
338#else
339 return 0;
340#endif
341}
342
343int arch_ioremap_pmd_supported(void)
344{
345 return cpu_has_pse;
346}
347
329/* 348/*
330 * Convert a physical pointer to a virtual kernel pointer for /dev/mem 349 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
331 * access 350 * access
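With arch_ioremap_pud_supported()/arch_ioremap_pmd_supported() added above, the generic ioremap path can pick a huge I/O mapping only when the CPU supports it and the request is suitably aligned and sized, falling back toward 4K when MTRRs say otherwise. A standalone sketch of just the alignment/size/capability part of that choice; the chooser function is hypothetical and the MTRR fallback mentioned in the new comment is omitted.

#include <stdio.h>
#include <stdint.h>

#define SZ_2M (2ULL << 20)
#define SZ_1G (1ULL << 30)

/* Hypothetical helper: pick the largest mapping granularity that fits the request. */
static const char *pick_ioremap_size(uint64_t phys, uint64_t size,
				     int pud_supported, int pmd_supported)
{
	if (pud_supported && !(phys & (SZ_1G - 1)) && size >= SZ_1G)
		return "1G";
	if (pmd_supported && !(phys & (SZ_2M - 1)) && size >= SZ_2M)
		return "2M";
	return "4K";
}

int main(void)
{
	/* 1 GiB aligned, 1 GiB long, gbpages available -> 1G mapping */
	printf("%s\n", pick_ioremap_size(1ULL << 30, SZ_1G, 1, 1));
	/* 2 MiB aligned, 8 MiB long, no pud support -> 2M mapping */
	printf("%s\n", pick_ioremap_size(2ULL << 20, 4 * SZ_2M, 0, 1));
	return 0;
}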
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h>
10
11static u64 patterns[] __initdata = {
12 /* The first entry has to be 0 to leave memtest with zeroed memory */
13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
33{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
35 (unsigned long long) pattern,
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad);
39}
40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{
43 u64 *p, *start, *end;
44 u64 start_bad, last_bad;
45 u64 start_phys_aligned;
46 const size_t incr = sizeof(pattern);
47
48 start_phys_aligned = ALIGN(start_phys, incr);
49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
51 start_bad = 0;
52 last_bad = 0;
53
54 for (p = start; p < end; p++)
55 *p = pattern;
56
57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
59 continue;
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 continue;
63 }
64 if (start_bad)
65 reserve_bad_mem(pattern, start_bad, last_bad + incr);
66 start_bad = last_bad = start_phys_aligned;
67 }
68 if (start_bad)
69 reserve_bad_mem(pattern, start_bad, last_bad + incr);
70}
71
72static void __init do_one_pass(u64 pattern, u64 start, u64 end)
73{
74 u64 i;
75 phys_addr_t this_start, this_end;
76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end);
80 if (this_start < this_end) {
81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
82 (unsigned long long)this_start,
83 (unsigned long long)this_end,
84 (unsigned long long)cpu_to_be64(pattern));
85 memtest(pattern, this_start, this_end - this_start);
86 }
87 }
88}
89
90/* default is disabled */
91static int memtest_pattern __initdata;
92
93static int __init parse_memtest(char *arg)
94{
95 if (arg)
96 memtest_pattern = simple_strtoul(arg, NULL, 0);
97 else
98 memtest_pattern = ARRAY_SIZE(patterns);
99
100 return 0;
101}
102
103early_param("memtest", parse_memtest);
104
105void __init early_memtest(unsigned long start, unsigned long end)
106{
107 unsigned int i;
108 unsigned int idx = 0;
109
110 if (!memtest_pattern)
111 return;
112
113 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
114 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
115 idx = i % ARRAY_SIZE(patterns);
116 do_one_pass(patterns[idx], start, end);
117 }
118}
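The deleted memtest.c writes each pattern across free memory, reads it back, and coalesces adjacent failing addresses into one reserved range. A standalone sketch of that verify-and-coalesce pass over an ordinary buffer; reserve_bad_mem() is reduced to a printf and the memory under test is a local array rather than memblock ranges.

#include <stdio.h>
#include <stdint.h>

static void reserve_bad_mem(uint64_t pattern, size_t start_bad, size_t end_bad)
{
	printf("pattern %016llx: bad words [%zu, %zu)\n",
	       (unsigned long long)pattern, start_bad, end_bad);
}

/* Scan a pattern-filled buffer and coalesce adjacent mismatching words. */
static void memtest_verify(const uint64_t *buf, size_t nwords, uint64_t pattern)
{
	size_t i, start_bad = 0, last_bad = 0;
	int have_bad = 0;

	for (i = 0; i < nwords; i++) {
		if (buf[i] == pattern)
			continue;
		if (have_bad && i == last_bad + 1) {
			last_bad = i;
			continue;
		}
		if (have_bad)
			reserve_bad_mem(pattern, start_bad, last_bad + 1);
		start_bad = last_bad = i;
		have_bad = 1;
	}
	if (have_bad)
		reserve_bad_mem(pattern, start_bad, last_bad + 1);
}

int main(void)
{
	uint64_t buf[64], pattern = 0x5555555555555555ULL;
	size_t i;

	for (i = 0; i < 64; i++)	/* the "fill" pass */
		buf[i] = pattern;
	buf[10] = buf[11] = 0;		/* simulate two adjacent bad words */
	buf[20] = 0;			/* and one isolated bad word */

	memtest_verify(buf, 64, pattern);	/* prints [10, 12) and [20, 21) */
	return 0;
}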
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
65 return sysctl_legacy_va_layout; 65 return sysctl_legacy_va_layout;
66} 66}
67 67
68static unsigned long mmap_rnd(void) 68unsigned long arch_mmap_rnd(void)
69{ 69{
70 unsigned long rnd = 0; 70 unsigned long rnd;
71 71
72 /* 72 /*
73 * 8 bits of randomness in 32bit mmaps, 20 address space bits 73 * 8 bits of randomness in 32bit mmaps, 20 address space bits
74 * 28 bits of randomness in 64bit mmaps, 40 address space bits 74 * 28 bits of randomness in 64bit mmaps, 40 address space bits
75 */ 75 */
76 if (current->flags & PF_RANDOMIZE) { 76 if (mmap_is_ia32())
77 if (mmap_is_ia32()) 77 rnd = (unsigned long)get_random_int() % (1<<8);
78 rnd = get_random_int() % (1<<8); 78 else
79 else 79 rnd = (unsigned long)get_random_int() % (1<<28);
80 rnd = get_random_int() % (1<<28); 80
81 }
82 return rnd << PAGE_SHIFT; 81 return rnd << PAGE_SHIFT;
83} 82}
84 83
85static unsigned long mmap_base(void) 84static unsigned long mmap_base(unsigned long rnd)
86{ 85{
87 unsigned long gap = rlimit(RLIMIT_STACK); 86 unsigned long gap = rlimit(RLIMIT_STACK);
88 87
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
91 else if (gap > MAX_GAP) 90 else if (gap > MAX_GAP)
92 gap = MAX_GAP; 91 gap = MAX_GAP;
93 92
94 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); 93 return PAGE_ALIGN(TASK_SIZE - gap - rnd);
95} 94}
96 95
97/* 96/*
98 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
99 * does, but not when emulating X86_32 98 * does, but not when emulating X86_32
100 */ 99 */
101static unsigned long mmap_legacy_base(void) 100static unsigned long mmap_legacy_base(unsigned long rnd)
102{ 101{
103 if (mmap_is_ia32()) 102 if (mmap_is_ia32())
104 return TASK_UNMAPPED_BASE; 103 return TASK_UNMAPPED_BASE;
105 else 104 else
106 return TASK_UNMAPPED_BASE + mmap_rnd(); 105 return TASK_UNMAPPED_BASE + rnd;
107} 106}
108 107
109/* 108/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
112 */ 111 */
113void arch_pick_mmap_layout(struct mm_struct *mm) 112void arch_pick_mmap_layout(struct mm_struct *mm)
114{ 113{
115 mm->mmap_legacy_base = mmap_legacy_base(); 114 unsigned long random_factor = 0UL;
116 mm->mmap_base = mmap_base(); 115
116 if (current->flags & PF_RANDOMIZE)
117 random_factor = arch_mmap_rnd();
118
119 mm->mmap_legacy_base = mmap_legacy_base(random_factor);
117 120
118 if (mmap_is_legacy()) { 121 if (mmap_is_legacy()) {
119 mm->mmap_base = mm->mmap_legacy_base; 122 mm->mmap_base = mm->mmap_legacy_base;
120 mm->get_unmapped_area = arch_get_unmapped_area; 123 mm->get_unmapped_area = arch_get_unmapped_area;
121 } else { 124 } else {
125 mm->mmap_base = mmap_base(random_factor);
122 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 126 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
123 } 127 }
124} 128}
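The mmap.c change splits the random offset out of mmap_base()/mmap_legacy_base() so arch_pick_mmap_layout() computes it once, and only when PF_RANDOMIZE is set. A standalone 64-bit sketch of the arithmetic: 8 or 28 bits of randomness shifted by PAGE_SHIFT (so up to 40 bits of address-space randomization, matching the comment), subtracted from the top-down base. TASK_SIZE, the stack-gap clamp and the RNG are simplified placeholders here.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long arch_mmap_rnd(int is_ia32)
{
	unsigned long rnd;

	/* 8 bits of randomness for 32-bit tasks, 28 bits for 64-bit ones. */
	if (is_ia32)
		rnd = (unsigned long)rand() % (1 << 8);
	else
		rnd = (unsigned long)rand() % (1 << 28);

	return rnd << PAGE_SHIFT;
}

static unsigned long mmap_base(unsigned long task_size, unsigned long gap,
			       unsigned long rnd)
{
	/* inputs here are already page aligned, so PAGE_ALIGN() is omitted */
	return task_size - gap - rnd;
}

int main(void)
{
	unsigned long task_size = 0x7ffffffff000UL;	/* illustrative 64-bit TASK_SIZE */
	unsigned long rnd = arch_mmap_rnd(0);

	printf("random offset: %#lx of at most %#lx (28 + 12 = 40 bits)\n",
	       rnd, ((1UL << 28) - 1) << PAGE_SHIFT);
	printf("top-down mmap_base: %#lx\n", mmap_base(task_size, 8UL << 20, rnd));
	return 0;
}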
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index cd4785bbacb9..4053bb58bf92 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void)
482 &memblock.reserved, mb->nid); 482 &memblock.reserved, mb->nid);
483 } 483 }
484 484
485 /* Mark all kernel nodes. */ 485 /*
486 * Mark all kernel nodes.
487 *
488 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
489 * may not include all the memblock.reserved memory ranges because
490 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
491 */
486 for_each_memblock(reserved, r) 492 for_each_memblock(reserved, r)
487 node_set(r->nid, numa_kernel_nodes); 493 if (r->nid != MAX_NUMNODES)
494 node_set(r->nid, numa_kernel_nodes);
488 495
489 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ 496 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
490 for (i = 0; i < numa_meminfo.nr_blks; i++) { 497 for (i = 0; i < numa_meminfo.nr_blks; i++) {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 536ea2fb6e33..89af288ec674 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
81 seq_printf(m, "DirectMap4M: %8lu kB\n", 81 seq_printf(m, "DirectMap4M: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_2M] << 12); 82 direct_pages_count[PG_LEVEL_2M] << 12);
83#endif 83#endif
84#ifdef CONFIG_X86_64
85 if (direct_gbpages) 84 if (direct_gbpages)
86 seq_printf(m, "DirectMap1G: %8lu kB\n", 85 seq_printf(m, "DirectMap1G: %8lu kB\n",
87 direct_pages_count[PG_LEVEL_1G] << 20); 86 direct_pages_count[PG_LEVEL_1G] << 20);
88#endif
89} 87}
90#else 88#else
91static inline void split_page_count(int level) { } 89static inline void split_page_count(int level) { }
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages)
1654{ 1652{
1655 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1653 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1656} 1654}
1657EXPORT_SYMBOL_GPL(set_memory_ro);
1658 1655
1659int set_memory_rw(unsigned long addr, int numpages) 1656int set_memory_rw(unsigned long addr, int numpages)
1660{ 1657{
1661 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1658 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1662} 1659}
1663EXPORT_SYMBOL_GPL(set_memory_rw);
1664 1660
1665int set_memory_np(unsigned long addr, int numpages) 1661int set_memory_np(unsigned long addr, int numpages)
1666{ 1662{
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7ac68698406c..35af6771a95a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
610} 610}
611 611
612#ifdef CONFIG_STRICT_DEVMEM 612#ifdef CONFIG_STRICT_DEVMEM
613/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ 613/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
614static inline int range_is_allowed(unsigned long pfn, unsigned long size) 614static inline int range_is_allowed(unsigned long pfn, unsigned long size)
615{ 615{
616 return 1; 616 return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
628 628
629 while (cursor < to) { 629 while (cursor < to) {
630 if (!devmem_is_allowed(pfn)) { 630 if (!devmem_is_allowed(pfn)) {
631 printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", 631 printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
632 current->comm, from, to - 1); 632 current->comm, from, to - 1);
633 return 0; 633 return 0;
634 } 634 }
635 cursor += PAGE_SIZE; 635 cursor += PAGE_SIZE;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7b22adaad4f1..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/tlb.h> 5#include <asm/tlb.h>
6#include <asm/fixmap.h> 6#include <asm/fixmap.h>
7#include <asm/mtrr.h>
7 8
8#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 9#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
9 10
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
58 tlb_remove_page(tlb, pte); 59 tlb_remove_page(tlb, pte);
59} 60}
60 61
61#if PAGETABLE_LEVELS > 2 62#if CONFIG_PGTABLE_LEVELS > 2
62void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 63void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
63{ 64{
64 struct page *page = virt_to_page(pmd); 65 struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
74 tlb_remove_page(tlb, page); 75 tlb_remove_page(tlb, page);
75} 76}
76 77
77#if PAGETABLE_LEVELS > 3 78#if CONFIG_PGTABLE_LEVELS > 3
78void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 79void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
79{ 80{
80 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
81 tlb_remove_page(tlb, virt_to_page(pud)); 82 tlb_remove_page(tlb, virt_to_page(pud));
82} 83}
83#endif /* PAGETABLE_LEVELS > 3 */ 84#endif /* CONFIG_PGTABLE_LEVELS > 3 */
84#endif /* PAGETABLE_LEVELS > 2 */ 85#endif /* CONFIG_PGTABLE_LEVELS > 2 */
85 86
86static inline void pgd_list_add(pgd_t *pgd) 87static inline void pgd_list_add(pgd_t *pgd)
87{ 88{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
117 /* If the pgd points to a shared pagetable level (either the 118 /* If the pgd points to a shared pagetable level (either the
118 ptes in non-PAE, or shared PMD in PAE), then just copy the 119 ptes in non-PAE, or shared PMD in PAE), then just copy the
119 references from swapper_pg_dir. */ 120 references from swapper_pg_dir. */
120 if (PAGETABLE_LEVELS == 2 || 121 if (CONFIG_PGTABLE_LEVELS == 2 ||
121 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 122 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
122 PAGETABLE_LEVELS == 4) { 123 CONFIG_PGTABLE_LEVELS == 4) {
123 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 124 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
124 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 125 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
125 KERNEL_PGD_PTRS); 126 KERNEL_PGD_PTRS);
@@ -275,12 +276,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
275 } 276 }
276} 277}
277 278
279/*
280 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
281 * assumes that pgd should be in one page.
282 *
283 * But kernel with PAE paging that is not running as a Xen domain
284 * only needs to allocate 32 bytes for pgd instead of one page.
285 */
286#ifdef CONFIG_X86_PAE
287
288#include <linux/slab.h>
289
290#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
291#define PGD_ALIGN 32
292
293static struct kmem_cache *pgd_cache;
294
295static int __init pgd_cache_init(void)
296{
297 /*
298 * When PAE kernel is running as a Xen domain, it does not use
299 * shared kernel pmd. And this requires a whole page for pgd.
300 */
301 if (!SHARED_KERNEL_PMD)
302 return 0;
303
304 /*
305 * when PAE kernel is not running as a Xen domain, it uses
306 * shared kernel pmd. Shared kernel pmd does not require a whole
307 * page for pgd. We are able to just allocate a 32-byte for pgd.
308 * During boot time, we create a 32-byte slab for pgd table allocation.
309 */
310 pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
311 SLAB_PANIC, NULL);
312 if (!pgd_cache)
313 return -ENOMEM;
314
315 return 0;
316}
317core_initcall(pgd_cache_init);
318
319static inline pgd_t *_pgd_alloc(void)
320{
321 /*
322 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
323 * We allocate one page for pgd.
324 */
325 if (!SHARED_KERNEL_PMD)
326 return (pgd_t *)__get_free_page(PGALLOC_GFP);
327
328 /*
329 * Now PAE kernel is not running as a Xen domain. We can allocate
330 * a 32-byte slab for pgd to save memory space.
331 */
332 return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
333}
334
335static inline void _pgd_free(pgd_t *pgd)
336{
337 if (!SHARED_KERNEL_PMD)
338 free_page((unsigned long)pgd);
339 else
340 kmem_cache_free(pgd_cache, pgd);
341}
342#else
343static inline pgd_t *_pgd_alloc(void)
344{
345 return (pgd_t *)__get_free_page(PGALLOC_GFP);
346}
347
348static inline void _pgd_free(pgd_t *pgd)
349{
350 free_page((unsigned long)pgd);
351}
352#endif /* CONFIG_X86_PAE */
353
278pgd_t *pgd_alloc(struct mm_struct *mm) 354pgd_t *pgd_alloc(struct mm_struct *mm)
279{ 355{
280 pgd_t *pgd; 356 pgd_t *pgd;
281 pmd_t *pmds[PREALLOCATED_PMDS]; 357 pmd_t *pmds[PREALLOCATED_PMDS];
282 358
283 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 359 pgd = _pgd_alloc();
284 360
285 if (pgd == NULL) 361 if (pgd == NULL)
286 goto out; 362 goto out;
@@ -310,7 +386,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
310out_free_pmds: 386out_free_pmds:
311 free_pmds(mm, pmds); 387 free_pmds(mm, pmds);
312out_free_pgd: 388out_free_pgd:
313 free_page((unsigned long)pgd); 389 _pgd_free(pgd);
314out: 390out:
315 return NULL; 391 return NULL;
316} 392}
@@ -320,7 +396,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
320 pgd_mop_up_pmds(mm, pgd); 396 pgd_mop_up_pmds(mm, pgd);
321 pgd_dtor(pgd); 397 pgd_dtor(pgd);
322 paravirt_pgd_free(mm, pgd); 398 paravirt_pgd_free(mm, pgd);
323 free_page((unsigned long)pgd); 399 _pgd_free(pgd);
324} 400}
325 401
326/* 402/*
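The PAE branch above sizes the pgd as PTRS_PER_PGD * sizeof(pgd_t); with PAE's four top-level entries of eight bytes each that is the 32 bytes mentioned in the comment, against a full 4 KiB page when SHARED_KERNEL_PMD is off (the Xen case). A quick standalone check of that arithmetic; the entry count and entry size are assumptions spelled out in the code, not values read from the headers.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed PAE layout: four top-level entries of eight bytes each. */
	const size_t ptrs_per_pgd = 4;
	const size_t pgd_entry_size = sizeof(uint64_t);
	const size_t page_size = 4096;
	size_t pgd_size = ptrs_per_pgd * pgd_entry_size;

	printf("PAE pgd: %zu bytes from the slab vs %zu bytes as a full page\n",
	       pgd_size, page_size);
	printf("pgds per page when slab-packed: %zu\n", page_size / pgd_size);
	return 0;
}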
@@ -485,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
485{ 561{
486 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 562 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
487} 563}
564
565#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
566int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
567{
568 u8 mtrr;
569
570 /*
571 * Do not use a huge page when the range is covered by non-WB type
572 * of MTRRs.
573 */
574 mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
575 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
576 return 0;
577
578 prot = pgprot_4k_2_large(prot);
579
580 set_pte((pte_t *)pud, pfn_pte(
581 (u64)addr >> PAGE_SHIFT,
582 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
583
584 return 1;
585}
586
587int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
588{
589 u8 mtrr;
590
591 /*
592 * Do not use a huge page when the range is covered by non-WB type
593 * of MTRRs.
594 */
595 mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
596 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
597 return 0;
598
599 prot = pgprot_4k_2_large(prot);
600
601 set_pte((pte_t *)pmd, pfn_pte(
602 (u64)addr >> PAGE_SHIFT,
603 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
604
605 return 1;
606}
607
608int pud_clear_huge(pud_t *pud)
609{
610 if (pud_large(*pud)) {
611 pud_clear(pud);
612 return 1;
613 }
614
615 return 0;
616}
617
618int pmd_clear_huge(pmd_t *pmd)
619{
620 if (pmd_large(*pmd)) {
621 pmd_clear(pmd);
622 return 1;
623 }
624
625 return 0;
626}
627#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
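pud_set_huge()/pmd_set_huge() above refuse a huge mapping when the range is covered by a non-WB MTRR, and treat a lookup result of 0xFF as "no MTRR covers the range", which is acceptable. A standalone sketch of just that gate; the MTRR type encodings and the 0xFF convention are assumptions mirrored from the hunk rather than taken from a header.

#include <stdio.h>
#include <stdint.h>

#define MTRR_TYPE_UNCACHABLE 0x00	/* assumed encodings */
#define MTRR_TYPE_WRBACK     0x06
#define MTRR_TYPE_INVALID    0xFF	/* lookup found no covering MTRR */

/* Return 1 if a huge page may be used for a range with this MTRR lookup result. */
static int huge_page_allowed(uint8_t mtrr_type)
{
	return mtrr_type == MTRR_TYPE_WRBACK || mtrr_type == MTRR_TYPE_INVALID;
}

int main(void)
{
	printf("WB:        %d\n", huge_page_allowed(MTRR_TYPE_WRBACK));		/* 1 */
	printf("UC:        %d\n", huge_page_allowed(MTRR_TYPE_UNCACHABLE));	/* 0 */
	printf("uncovered: %d\n", huge_page_allowed(MTRR_TYPE_INVALID));	/* 1 */
	return 0;
}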
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5efb64..4e664bdb535a 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
111{ 111{
112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); 112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
113 113
114 if (!user_mode_vm(regs)) { 114 if (!user_mode(regs)) {
115 unsigned long stack = kernel_stack_pointer(regs); 115 unsigned long stack = kernel_stack_pointer(regs);
116 if (depth) 116 if (depth)
117 dump_trace(NULL, regs, (unsigned long *)stack, 0, 117 dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2fb384724ebb..8fd6f44aee83 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
490 if (!bus) { 490 if (!bus) {
491 pci_free_resource_list(&resources); 491 pci_free_resource_list(&resources);
492 kfree(sd); 492 kfree(sd);
493 return;
493 } 494 }
495 pci_bus_add_devices(bus);
494} 496}
495 497
496void __init pcibios_set_cache_line_size(void) 498void __init pcibios_set_cache_line_size(void)
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d143d216d52b..d7f997f7c26d 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
67 67
68 image = efi_lookup_mapped_addr(bgrt_tab->image_address); 68 image = efi_lookup_mapped_addr(bgrt_tab->image_address);
69 if (!image) { 69 if (!image) {
70 image = early_memremap(bgrt_tab->image_address, 70 image = early_ioremap(bgrt_tab->image_address,
71 sizeof(bmp_header)); 71 sizeof(bmp_header));
72 ioremapped = true; 72 ioremapped = true;
73 if (!image) { 73 if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
89 } 89 }
90 90
91 if (ioremapped) { 91 if (ioremapped) {
92 image = early_memremap(bgrt_tab->image_address, 92 image = early_ioremap(bgrt_tab->image_address,
93 bmp_header.size); 93 bmp_header.size);
94 if (!image) { 94 if (!image) {
95 pr_err("Ignoring BGRT: failed to map image memory\n"); 95 pr_err("Ignoring BGRT: failed to map image memory\n");
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index dbc8627a5cdf..02744df576d5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
85 efi_memory_desc_t *virtual_map) 85 efi_memory_desc_t *virtual_map)
86{ 86{
87 efi_status_t status; 87 efi_status_t status;
88 unsigned long flags;
89 pgd_t *save_pgd;
88 90
89 efi_call_phys_prolog(); 91 save_pgd = efi_call_phys_prolog();
92
93 /* Disable interrupts around EFI calls: */
94 local_irq_save(flags);
90 status = efi_call_phys(efi_phys.set_virtual_address_map, 95 status = efi_call_phys(efi_phys.set_virtual_address_map,
91 memory_map_size, descriptor_size, 96 memory_map_size, descriptor_size,
92 descriptor_version, virtual_map); 97 descriptor_version, virtual_map);
93 efi_call_phys_epilog(); 98 local_irq_restore(flags);
99
100 efi_call_phys_epilog(save_pgd);
101
94 return status; 102 return status;
95} 103}
96 104
@@ -491,7 +499,8 @@ void __init efi_init(void)
491 if (efi_memmap_init()) 499 if (efi_memmap_init())
492 return; 500 return;
493 501
494 print_efi_memmap(); 502 if (efi_enabled(EFI_DBG))
503 print_efi_memmap();
495} 504}
496 505
497void __init efi_late_init(void) 506void __init efi_late_init(void)
@@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str)
939{ 948{
940 if (parse_option_str(str, "old_map")) 949 if (parse_option_str(str, "old_map"))
941 set_bit(EFI_OLD_MEMMAP, &efi.flags); 950 set_bit(EFI_OLD_MEMMAP, &efi.flags);
951 if (parse_option_str(str, "debug"))
952 set_bit(EFI_DBG, &efi.flags);
942 953
943 return 0; 954 return 0;
944} 955}
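The efi.c hunk moves interrupt disabling out of the prolog/epilog and threads the saved pgd through the call instead of a static variable. A minimal standalone sketch of the resulting call order with stub functions, only to show the sequencing (save page-table state, disable IRQs just around the firmware call, re-enable, then restore); none of these stubs are the real kernel interfaces.

#include <stdio.h>

typedef struct { int dummy; } pgd_t;		/* stand-in type, stubs only */
static pgd_t fake_pgd;

static pgd_t *efi_call_phys_prolog(void)        { puts("switch to 1:1 page tables"); return &fake_pgd; }
static void   efi_call_phys_epilog(pgd_t *save) { (void)save; puts("restore saved page tables"); }
static void   irq_save(void)                    { puts("irqs off"); }
static void   irq_restore(void)                 { puts("irqs on"); }
static long   efi_call_phys(void)               { puts("  firmware call"); return 0; }

int main(void)
{
	pgd_t *save_pgd = efi_call_phys_prolog();
	long status;

	irq_save();		/* interrupts are now disabled only around the EFI call */
	status = efi_call_phys();
	irq_restore();

	efi_call_phys_epilog(save_pgd);
	return (int)status;
}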
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e7cda52936..ed5b67338294 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -33,11 +33,10 @@
33 33
34/* 34/*
35 * To make EFI call EFI runtime service in physical addressing mode we need 35 * To make EFI call EFI runtime service in physical addressing mode we need
36 * prolog/epilog before/after the invocation to disable interrupt, to 36 * prolog/epilog before/after the invocation to claim the EFI runtime service
37 * claim EFI runtime service handler exclusively and to duplicate a memory in 37 * handler exclusively and to duplicate a memory mapping in low memory space,
38 * low memory space say 0 - 3G. 38 * say 0 - 3G.
39 */ 39 */
40static unsigned long efi_rt_eflags;
41 40
42void efi_sync_low_kernel_mappings(void) {} 41void efi_sync_low_kernel_mappings(void) {}
43void __init efi_dump_pagetable(void) {} 42void __init efi_dump_pagetable(void) {}
@@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md)
57void __init efi_map_region_fixed(efi_memory_desc_t *md) {} 56void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
58void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} 57void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
59 58
60void __init efi_call_phys_prolog(void) 59pgd_t * __init efi_call_phys_prolog(void)
61{ 60{
62 struct desc_ptr gdt_descr; 61 struct desc_ptr gdt_descr;
62 pgd_t *save_pgd;
63 63
64 local_irq_save(efi_rt_eflags); 64 /* Current pgd is swapper_pg_dir, we'll restore it later: */
65 65 save_pgd = swapper_pg_dir;
66 load_cr3(initial_page_table); 66 load_cr3(initial_page_table);
67 __flush_tlb_all(); 67 __flush_tlb_all();
68 68
69 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 69 gdt_descr.address = __pa(get_cpu_gdt_table(0));
70 gdt_descr.size = GDT_SIZE - 1; 70 gdt_descr.size = GDT_SIZE - 1;
71 load_gdt(&gdt_descr); 71 load_gdt(&gdt_descr);
72
73 return save_pgd;
72} 74}
73 75
74void __init efi_call_phys_epilog(void) 76void __init efi_call_phys_epilog(pgd_t *save_pgd)
75{ 77{
76 struct desc_ptr gdt_descr; 78 struct desc_ptr gdt_descr;
77 79
@@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void)
79 gdt_descr.size = GDT_SIZE - 1; 81 gdt_descr.size = GDT_SIZE - 1;
80 load_gdt(&gdt_descr); 82 load_gdt(&gdt_descr);
81 83
82 load_cr3(swapper_pg_dir); 84 load_cr3(save_pgd);
83 __flush_tlb_all(); 85 __flush_tlb_all();
84
85 local_irq_restore(efi_rt_eflags);
86} 86}
87 87
88void __init efi_runtime_mkexec(void) 88void __init efi_runtime_mkexec(void)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 17e80d829df0..a0ac0f9c307f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,9 +41,6 @@
41#include <asm/realmode.h> 41#include <asm/realmode.h>
42#include <asm/time.h> 42#include <asm/time.h>
43 43
44static pgd_t *save_pgd __initdata;
45static unsigned long efi_flags __initdata;
46
47/* 44/*
48 * We allocate runtime services regions bottom-up, starting from -4G, i.e. 45 * We allocate runtime services regions bottom-up, starting from -4G, i.e.
49 * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. 46 * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
@@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable)
78 } 75 }
79} 76}
80 77
81void __init efi_call_phys_prolog(void) 78pgd_t * __init efi_call_phys_prolog(void)
82{ 79{
83 unsigned long vaddress; 80 unsigned long vaddress;
81 pgd_t *save_pgd;
82
84 int pgd; 83 int pgd;
85 int n_pgds; 84 int n_pgds;
86 85
87 if (!efi_enabled(EFI_OLD_MEMMAP)) 86 if (!efi_enabled(EFI_OLD_MEMMAP))
88 return; 87 return NULL;
89 88
90 early_code_mapping_set_exec(1); 89 early_code_mapping_set_exec(1);
91 local_irq_save(efi_flags);
92 90
93 n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); 91 n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
94 save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); 92 save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
@@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void)
99 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); 97 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
100 } 98 }
101 __flush_tlb_all(); 99 __flush_tlb_all();
100
101 return save_pgd;
102} 102}
103 103
104void __init efi_call_phys_epilog(void) 104void __init efi_call_phys_epilog(pgd_t *save_pgd)
105{ 105{
106 /* 106 /*
107 * After the lock is released, the original page table is restored. 107 * After the lock is released, the original page table is restored.
108 */ 108 */
109 int pgd; 109 int pgd_idx;
110 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); 110 int nr_pgds;
111 111
112 if (!efi_enabled(EFI_OLD_MEMMAP)) 112 if (!save_pgd)
113 return; 113 return;
114 114
115 for (pgd = 0; pgd < n_pgds; pgd++) 115 nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
116 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); 116
117 for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
118 set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
119
117 kfree(save_pgd); 120 kfree(save_pgd);
121
118 __flush_tlb_all(); 122 __flush_tlb_all();
119 local_irq_restore(efi_flags);
120 early_code_mapping_set_exec(0); 123 early_code_mapping_set_exec(0);
121} 124}
122 125
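In the 64-bit prolog above, the number of saved pgd entries is DIV_ROUND_UP(max_pfn << PAGE_SHIFT, PGDIR_SIZE); with 4-level paging each pgd entry spans 512 GiB, so the whole direct map usually needs only a few entries. A standalone check of that arithmetic, assuming the usual x86-64 PAGE_SHIFT and PGDIR span.

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	const uint64_t page_shift = 12;
	const uint64_t pgdir_size = 1ULL << 39;		/* 512 GiB per pgd entry, 4-level paging */
	uint64_t max_pfn = (64ULL << 30) >> page_shift;	/* e.g. 64 GiB of RAM */

	uint64_t n_pgds = DIV_ROUND_UP(max_pfn << page_shift, pgdir_size);

	printf("saving %llu pgd entr%s (%llu bytes)\n",
	       (unsigned long long)n_pgds, n_pgds == 1 ? "y" : "ies",
	       (unsigned long long)(n_pgds * sizeof(uint64_t)));
	return 0;
}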
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index c9a0838890e2..278e4da4222f 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <asm-generic/sections.h> 13#include <asm-generic/sections.h>
14#include <asm/cpu_device_id.h>
14#include <asm/imr.h> 15#include <asm/imr.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
@@ -101,6 +102,12 @@ static void __init imr_self_test(void)
101 } 102 }
102} 103}
103 104
105static const struct x86_cpu_id imr_ids[] __initconst = {
106 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
107 {}
108};
109MODULE_DEVICE_TABLE(x86cpu, imr_ids);
110
104/** 111/**
105 * imr_self_test_init - entry point for IMR driver. 112 * imr_self_test_init - entry point for IMR driver.
106 * 113 *
@@ -108,7 +115,8 @@ static void __init imr_self_test(void)
108 */ 115 */
109static int __init imr_self_test_init(void) 116static int __init imr_self_test_init(void)
110{ 117{
111 imr_self_test(); 118 if (x86_match_cpu(imr_ids))
119 imr_self_test();
112 return 0; 120 return 0;
113} 121}
114 122
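The imr_selftest change gates the test on an x86_cpu_id table so it only runs on the Quark X1000 (Intel, family 5, model 9). A standalone sketch of the match-table idea using a plain struct and loop; the real x86_match_cpu() consults boot_cpu_data and richer fields, here the "current CPU" is just a function argument.

#include <stdio.h>

struct cpu_id { int vendor_intel; int family; int model; };

static const struct cpu_id imr_ids[] = {
	{ 1, 5, 9 },	/* Intel Quark SoC X1000, as in the table above */
	{ 0, 0, 0 },	/* terminator */
};

/* Minimal stand-in for x86_match_cpu(): any table entry matching the CPU wins. */
static int match_cpu(const struct cpu_id *tbl, const struct cpu_id *cpu)
{
	for (; tbl->vendor_intel || tbl->family || tbl->model; tbl++)
		if (tbl->vendor_intel == cpu->vendor_intel &&
		    tbl->family == cpu->family && tbl->model == cpu->model)
			return 1;
	return 0;
}

int main(void)
{
	struct cpu_id quark = { 1, 5, 9 };
	struct cpu_id other = { 1, 6, 60 };	/* some other Intel part */

	printf("quark: %d\n", match_cpu(imr_ids, &quark));	/* 1: run the self test */
	printf("other: %d\n", match_cpu(imr_ids, &other));	/* 0: skip it */
	return 0;
}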
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
61 61
62 if (psy) { 62 if (psy) {
63 power_supply_changed(psy); 63 power_supply_changed(psy);
64 put_device(psy->dev); 64 power_supply_put(psy);
65 } 65 }
66} 66}
67 67
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
71 71
72 if (psy) { 72 if (psy) {
73 power_supply_changed(psy); 73 power_supply_changed(psy);
74 put_device(psy->dev); 74 power_supply_put(psy);
75 } 75 }
76} 76}
77 77
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
83 83
84 if (psy) { 84 if (psy) {
85 power_supply_changed(psy); 85 power_supply_changed(psy);
86 put_device(psy->dev); 86 power_supply_put(psy);
87 } 87 }
88} 88}
89 89
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
93 93
94 if (psy) { 94 if (psy) {
95 power_supply_changed(psy); 95 power_supply_changed(psy);
96 put_device(psy->dev); 96 power_supply_put(psy);
97 } 97 }
98} 98}
99 99
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 994798548b1a..3b6ec42718e4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
415 struct reset_args reset_args; 415 struct reset_args reset_args;
416 416
417 reset_args.sender = sender; 417 reset_args.sender = sender;
418 cpus_clear(*mask); 418 cpumask_clear(mask);
419 /* find a single cpu for each uvhub in this distribution mask */ 419 /* find a single cpu for each uvhub in this distribution mask */
420 maskbits = sizeof(struct pnmask) * BITSPERBYTE; 420 maskbits = sizeof(struct pnmask) * BITSPERBYTE;
421 /* each bit is a pnode relative to the partition base pnode */ 421 /* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
425 continue; 425 continue;
426 apnode = pnode + bcp->partition_base_pnode; 426 apnode = pnode + bcp->partition_base_pnode;
427 cpu = pnode_to_first_cpu(apnode, smaster); 427 cpu = pnode_to_first_cpu(apnode, smaster);
428 cpu_set(cpu, *mask); 428 cpumask_set_cpu(cpu, mask);
429 } 429 }
430 430
431 /* IPI all cpus; preemption is already disabled */ 431 /* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1126 /* don't actually do a shootdown of the local cpu */ 1126 /* don't actually do a shootdown of the local cpu */
1127 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 1127 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
1128 1128
1129 if (cpu_isset(cpu, *cpumask)) 1129 if (cpumask_test_cpu(cpu, cpumask))
1130 stat->s_ntargself++; 1130 stat->s_ntargself++;
1131 1131
1132 bau_desc = bcp->descriptor_base; 1132 bau_desc = bcp->descriptor_base;
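The tlb_uv.c hunks swap the old cpus_clear()/cpu_set()/cpu_isset() macros, which take a cpumask_t value, for cpumask_clear()/cpumask_set_cpu()/cpumask_test_cpu(), which take a struct cpumask pointer. A standalone sketch of the same three operations on a plain bitmap, to show the semantics are unchanged; the word layout is a simplification, not the kernel's cpumask implementation.

#include <stdio.h>
#include <string.h>

#define NR_CPUS       64
#define BITS_PER_LONG (8 * sizeof(unsigned long))

struct cpumask { unsigned long bits[NR_CPUS / BITS_PER_LONG]; };

static void cpumask_clear(struct cpumask *m)
{
	memset(m->bits, 0, sizeof(m->bits));
}

static void cpumask_set_cpu(unsigned int cpu, struct cpumask *m)
{
	m->bits[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
}

static int cpumask_test_cpu(unsigned int cpu, const struct cpumask *m)
{
	return (m->bits[cpu / BITS_PER_LONG] >> (cpu % BITS_PER_LONG)) & 1;
}

int main(void)
{
	struct cpumask mask;

	cpumask_clear(&mask);
	cpumask_set_cpu(3, &mask);
	printf("cpu 3: %d, cpu 5: %d\n",
	       cpumask_test_cpu(3, &mask), cpumask_test_cpu(5, &mask));	/* 1, 0 */
	return 0;
}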
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
134static void fix_processor_context(void) 134static void fix_processor_context(void)
135{ 135{
136 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
137 struct tss_struct *t = &per_cpu(init_tss, cpu); 137 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
138#ifdef CONFIG_X86_64 138#ifdef CONFIG_X86_64
139 struct desc_struct *desc = get_cpu_gdt_table(cpu); 139 struct desc_struct *desc = get_cpu_gdt_table(cpu);
140 tss_desc tss; 140 tss_desc tss;
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 3323c2745248..a55abb9f6c5e 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR $@
19quiet_cmd_systbl = SYSTBL $@ 19quiet_cmd_systbl = SYSTBL $@
20 cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ 20 cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
21 21
22quiet_cmd_hypercalls = HYPERCALLS $@
23 cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
24
22syshdr_abi_unistd_32 := i386 25syshdr_abi_unistd_32 := i386
23$(uapi)/unistd_32.h: $(syscall32) $(syshdr) 26$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
24 $(call if_changed,syshdr) 27 $(call if_changed,syshdr)
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
47$(out)/syscalls_64.h: $(syscall64) $(systbl) 50$(out)/syscalls_64.h: $(syscall64) $(systbl)
48 $(call if_changed,systbl) 51 $(call if_changed,systbl)
49 52
53$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
54 $(call if_changed,hypercalls)
55
56$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
57
50uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h 58uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
51syshdr-y += syscalls_32.h 59syshdr-y += syscalls_32.h
52syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h 60syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
53syshdr-$(CONFIG_X86_64) += syscalls_64.h 61syshdr-$(CONFIG_X86_64) += syscalls_64.h
62syshdr-$(CONFIG_XEN) += xen-hypercalls.h
54 63
55targets += $(uapisyshdr-y) $(syshdr-y) 64targets += $(uapisyshdr-y) $(syshdr-y)
56 65
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
119110 i386 iopl sys_iopl 119110 i386 iopl sys_iopl
120111 i386 vhangup sys_vhangup 120111 i386 vhangup sys_vhangup
121112 i386 idle 121112 i386 idle
122113 i386 vm86old sys_vm86old sys32_vm86_warning 122113 i386 vm86old sys_vm86old sys_ni_syscall
123114 i386 wait4 sys_wait4 compat_sys_wait4 123114 i386 wait4 sys_wait4 compat_sys_wait4
124115 i386 swapoff sys_swapoff 124115 i386 swapoff sys_swapoff
125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
@@ -172,7 +172,7 @@
172163 i386 mremap sys_mremap 172163 i386 mremap sys_mremap
173164 i386 setresuid sys_setresuid16 173164 i386 setresuid sys_setresuid16
174165 i386 getresuid sys_getresuid16 174165 i386 getresuid sys_getresuid16
175166 i386 vm86 sys_vm86 sys32_vm86_warning 175166 i386 vm86 sys_vm86 sys_ni_syscall
176167 i386 query_module 176167 i386 query_module
177168 i386 poll sys_poll 177168 i386 poll sys_poll
178169 i386 nfsservctl 178169 i386 nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl stub_iopl 181172 common iopl sys_iopl
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index eafa324eb7a5..acb384d24669 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o
21 21
22subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o 22subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
23subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o 23subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
24subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
25 24
26else 25else
27 26
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..7e8a1a650435 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,22 +36,11 @@
36#endif /* CONFIG_X86_PPRO_FENCE */ 36#endif /* CONFIG_X86_PPRO_FENCE */
37#define dma_wmb() barrier() 37#define dma_wmb() barrier()
38 38
39#ifdef CONFIG_SMP
40
41#define smp_mb() mb()
42#define smp_rmb() dma_rmb()
43#define smp_wmb() barrier()
44#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
45
46#else /* CONFIG_SMP */
47
48#define smp_mb() barrier() 39#define smp_mb() barrier()
49#define smp_rmb() barrier() 40#define smp_rmb() barrier()
50#define smp_wmb() barrier() 41#define smp_wmb() barrier()
51#define set_mb(var, value) do { var = value; barrier(); } while (0) 42#define set_mb(var, value) do { var = value; barrier(); } while (0)
52 43
53#endif /* CONFIG_SMP */
54
55#define read_barrier_depends() do { } while (0) 44#define read_barrier_depends() do { } while (0)
56#define smp_read_barrier_depends() do { } while (0) 45#define smp_read_barrier_depends() do { } while (0)
57 46
@@ -64,8 +53,8 @@
64 */ 53 */
65static inline void rdtsc_barrier(void) 54static inline void rdtsc_barrier(void)
66{ 55{
67 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 56 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
68 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 57 "lfence", X86_FEATURE_LFENCE_RDTSC);
69} 58}
70 59
71#endif 60#endif
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 25a1022dd793..0a656b727b1a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
210 210
211#define ELF_EXEC_PAGESIZE 4096 211#define ELF_EXEC_PAGESIZE 4096
212 212
213#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) 213#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
214 214
215extern long elf_aux_hwcap; 215extern long elf_aux_hwcap;
216#define ELF_HWCAP (elf_aux_hwcap) 216#define ELF_HWCAP (elf_aux_hwcap)
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 8e08176f0bcb..5c0b711d2433 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -8,9 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <asm/unistd.h> 9#include <asm/unistd.h>
10#include <os.h> 10#include <os.h>
11#include <proc_mm.h>
12#include <skas.h> 11#include <skas.h>
13#include <skas_ptrace.h>
14#include <sysdep/tls.h> 12#include <sysdep/tls.h>
15 13
16extern int modify_ldt(int func, void *ptr, unsigned long bytecount); 14extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
19 struct user_desc *desc, void **addr, int done) 17 struct user_desc *desc, void **addr, int done)
20{ 18{
21 long res; 19 long res;
22 20 void *stub_addr;
23 if (proc_mm) { 21 res = syscall_stub_data(mm_idp, (unsigned long *)desc,
24 /* 22 (sizeof(*desc) + sizeof(long) - 1) &
25 * This is a special handling for the case, that the mm to 23 ~(sizeof(long) - 1),
26 * modify isn't current->active_mm. 24 addr, &stub_addr);
27 * If this is called directly by modify_ldt, 25 if (!res) {
28 * (current->active_mm->context.skas.u == mm_idp) 26 unsigned long args[] = { func,
29 * will be true. So no call to __switch_mm(mm_idp) is done. 27 (unsigned long)stub_addr,
30 * If this is called in case of init_new_ldt or PTRACE_LDT, 28 sizeof(*desc),
31 * mm_idp won't belong to current->active_mm, but child->mm. 29 0, 0, 0 };
32 * So we need to switch child's mm into our userspace, then 30 res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
33 * later switch back. 31 0, addr, done);
34 *
35 * Note: I'm unsure: should interrupts be disabled here?
36 */
37 if (!current->active_mm || current->active_mm == &init_mm ||
38 mm_idp != &current->active_mm->context.id)
39 __switch_mm(mm_idp);
40 }
41
42 if (ptrace_ldt) {
43 struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
44 .func = func,
45 .ptr = desc,
46 .bytecount = sizeof(*desc)};
47 u32 cpu;
48 int pid;
49
50 if (!proc_mm)
51 pid = mm_idp->u.pid;
52 else {
53 cpu = get_cpu();
54 pid = userspace_pid[cpu];
55 }
56
57 res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op);
58
59 if (proc_mm)
60 put_cpu();
61 }
62 else {
63 void *stub_addr;
64 res = syscall_stub_data(mm_idp, (unsigned long *)desc,
65 (sizeof(*desc) + sizeof(long) - 1) &
66 ~(sizeof(long) - 1),
67 addr, &stub_addr);
68 if (!res) {
69 unsigned long args[] = { func,
70 (unsigned long)stub_addr,
71 sizeof(*desc),
72 0, 0, 0 };
73 res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
74 0, addr, done);
75 }
76 } 32 }
77 33
78 if (proc_mm) {
79 /*
80 * This is the second part of special handling, that makes
81 * PTRACE_LDT possible to implement.
82 */
83 if (current->active_mm && current->active_mm != &init_mm &&
84 mm_idp != &current->active_mm->context.id)
85 __switch_mm(&current->active_mm->context.id);
86 }
87
88 return res;
89}
90
91static long read_ldt_from_host(void __user * ptr, unsigned long bytecount)
92{
93 int res, n;
94 struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) {
95 .func = 0,
96 .bytecount = bytecount,
97 .ptr = kmalloc(bytecount, GFP_KERNEL)};
98 u32 cpu;
99
100 if (ptrace_ldt.ptr == NULL)
101 return -ENOMEM;
102
103 /*
104 * This is called from sys_modify_ldt only, so userspace_pid gives
105 * us the right number
106 */
107
108 cpu = get_cpu();
109 res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt);
110 put_cpu();
111 if (res < 0)
112 goto out;
113
114 n = copy_to_user(ptr, ptrace_ldt.ptr, res);
115 if (n != 0)
116 res = -EFAULT;
117
118 out:
119 kfree(ptrace_ldt.ptr);
120
121 return res; 34 return res;
122} 35}
123 36
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
145 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; 58 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
146 err = bytecount; 59 err = bytecount;
147 60
148 if (ptrace_ldt)
149 return read_ldt_from_host(ptr, bytecount);
150
151 mutex_lock(&ldt->lock); 61 mutex_lock(&ldt->lock);
152 if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { 62 if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
153 size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; 63 size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
229 goto out; 139 goto out;
230 } 140 }
231 141
232 if (!ptrace_ldt) 142 mutex_lock(&ldt->lock);
233 mutex_lock(&ldt->lock);
234 143
235 err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); 144 err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
236 if (err) 145 if (err)
237 goto out_unlock; 146 goto out_unlock;
238 else if (ptrace_ldt) {
239 /* With PTRACE_LDT available, this is used as a flag only */
240 ldt->entry_count = 1;
241 goto out;
242 }
243 147
244 if (ldt_info.entry_number >= ldt->entry_count && 148 if (ldt_info.entry_number >= ldt->entry_count &&
245 ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { 149 ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
393 int i; 297 int i;
394 long page, err=0; 298 long page, err=0;
395 void *addr = NULL; 299 void *addr = NULL;
396 struct proc_mm_op copy;
397 300
398 301
399 if (!ptrace_ldt) 302 mutex_init(&new_mm->arch.ldt.lock);
400 mutex_init(&new_mm->arch.ldt.lock);
401 303
402 if (!from_mm) { 304 if (!from_mm) {
403 memset(&desc, 0, sizeof(desc)); 305 memset(&desc, 0, sizeof(desc));
404 /* 306 /*
405 * We have to initialize a clean ldt. 307 * Now we try to retrieve info about the ldt, we
308 * inherited from the host. All ldt-entries found
309 * will be reset in the following loop
406 */ 310 */
407 if (proc_mm) { 311 ldt_get_host_info();
408 /* 312 for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
409 * If the new mm was created using proc_mm, host's 313 desc.entry_number = *num_p;
410 * default-ldt currently is assigned, which normally 314 err = write_ldt_entry(&new_mm->id, 1, &desc,
411 * contains the call-gates for lcall7 and lcall27. 315 &addr, *(num_p + 1) == -1);
412 * To remove these gates, we simply write an empty 316 if (err)
413 * entry as number 0 to the host. 317 break;
414 */
415 err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1);
416 }
417 else{
418 /*
419 * Now we try to retrieve info about the ldt, we
420 * inherited from the host. All ldt-entries found
421 * will be reset in the following loop
422 */
423 ldt_get_host_info();
424 for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
425 desc.entry_number = *num_p;
426 err = write_ldt_entry(&new_mm->id, 1, &desc,
427 &addr, *(num_p + 1) == -1);
428 if (err)
429 break;
430 }
431 } 318 }
432 new_mm->arch.ldt.entry_count = 0; 319 new_mm->arch.ldt.entry_count = 0;
433 320
434 goto out; 321 goto out;
435 } 322 }
436 323
437 if (proc_mm) { 324 /*
438 /* 325 * Our local LDT is used to supply the data for
439 * We have a valid from_mm, so we now have to copy the LDT of 326 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
440 * from_mm to new_mm, because using proc_mm an new mm with 327 * i.e., we have to use the stub for modify_ldt, which
441 * an empty/default LDT was created in new_mm() 328 * can't handle the big read buffer of up to 64kB.
442 */ 329 */
443 copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, 330 mutex_lock(&from_mm->arch.ldt.lock);
444 .u = 331 if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
445 { .copy_segments = 332 memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
446 from_mm->id.u.mm_fd } } ); 333 sizeof(new_mm->arch.ldt.u.entries));
447 i = os_write_file(new_mm->id.u.mm_fd, &copy, sizeof(copy)); 334 else {
448 if (i != sizeof(copy)) 335 i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
449 printk(KERN_ERR "new_mm : /proc/mm copy_segments " 336 while (i-->0) {
450 "failed, err = %d\n", -i); 337 page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
451 } 338 if (!page) {
452 339 err = -ENOMEM;
453 if (!ptrace_ldt) { 340 break;
454 /*
455 * Our local LDT is used to supply the data for
456 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
457 * i.e., we have to use the stub for modify_ldt, which
458 * can't handle the big read buffer of up to 64kB.
459 */
460 mutex_lock(&from_mm->arch.ldt.lock);
461 if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
462 memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
463 sizeof(new_mm->arch.ldt.u.entries));
464 else {
465 i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
466 while (i-->0) {
467 page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
468 if (!page) {
469 err = -ENOMEM;
470 break;
471 }
472 new_mm->arch.ldt.u.pages[i] =
473 (struct ldt_entry *) page;
474 memcpy(new_mm->arch.ldt.u.pages[i],
475 from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
476 } 341 }
342 new_mm->arch.ldt.u.pages[i] =
343 (struct ldt_entry *) page;
344 memcpy(new_mm->arch.ldt.u.pages[i],
345 from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
477 } 346 }
478 new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
479 mutex_unlock(&from_mm->arch.ldt.lock);
480 } 347 }
348 new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
349 mutex_unlock(&from_mm->arch.ldt.lock);
481 350
482 out: 351 out:
483 return err; 352 return err;
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm)
488{ 357{
489 int i; 358 int i;
490 359
491 if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { 360 if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
492 i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; 361 i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
493 while (i-- > 0) 362 while (i-- > 0)
494 free_page((long) mm->arch.ldt.u.pages[i]); 363 free_page((long) mm->arch.ldt.u.pages[i]);
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index a26086b8a800..b6f2437ec29c 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -27,9 +27,6 @@ struct faultinfo {
27/* This is Page Fault */ 27/* This is Page Fault */
28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) 28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
29 29
30/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
31#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo)
32
33#define PTRACE_FULL_FAULTINFO 0 30#define PTRACE_FULL_FAULTINFO 0
34 31
35#endif 32#endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index f811cbe15d62..ee88f88974ea 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -27,9 +27,6 @@ struct faultinfo {
27/* This is Page Fault */ 27/* This is Page Fault */
28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) 28#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
29 29
30/* No broken SKAS API, which doesn't pass trap_no, here. */
31#define SEGV_MAYBE_FIXABLE(fi) 0
32
33#define PTRACE_FULL_FAULTINFO 1 30#define PTRACE_FULL_FAULTINFO 1
34 31
35#endif 32#endif
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h
deleted file mode 100644
index 453febe98993..000000000000
--- a/arch/x86/um/shared/sysdep/skas_ptrace.h
+++ /dev/null
@@ -1,22 +0,0 @@
1/*
2 * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
3 * Licensed under the GPL
4 */
5
6#ifndef __SYSDEP_X86_SKAS_PTRACE_H
7#define __SYSDEP_X86_SKAS_PTRACE_H
8
9struct ptrace_faultinfo {
10 int is_write;
11 unsigned long addr;
12};
13
14struct ptrace_ldt {
15 int func;
16 void *ptr;
17 unsigned long bytecount;
18};
19
20#define PTRACE_LDT 54
21
22#endif
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 0c8c32bfd792..592491d1d70d 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
549 if (err) 549 if (err)
550 return err; 550 return err;
551 551
552 /* Set up registers for signal handler */
553 {
554 struct exec_domain *ed = current_thread_info()->exec_domain;
555 if (unlikely(ed && ed->signal_invmap && sig < 32))
556 sig = ed->signal_invmap[sig];
557 }
558
559 PT_REGS_SP(regs) = (unsigned long) frame; 552 PT_REGS_SP(regs) = (unsigned long) frame;
560 PT_REGS_DI(regs) = sig; 553 PT_REGS_DI(regs) = sig;
561 /* In case the signal handler was declared without prototypes */ 554 /* In case the signal handler was declared without prototypes */
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
16 */ 16 */
17 17
18/* Not going to be implemented by UML, since we have no hardware. */ 18/* Not going to be implemented by UML, since we have no hardware. */
19#define stub_iopl sys_ni_syscall 19#define sys_iopl sys_ni_syscall
20#define sys_ioperm sys_ni_syscall 20#define sys_ioperm sys_ni_syscall
21 21
22/* 22/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 7b9be9822724..275a3a8b78af 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
51$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE 51$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
52 $(call if_changed,vdso) 52 $(call if_changed,vdso)
53 53
54HOST_EXTRACFLAGS += -I$(srctree)/tools/include 54HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi
55hostprogs-y += vdso2c 55hostprogs-y += vdso2c
56 56
57quiet_cmd_vdso2c = VDSO2C $@ 57quiet_cmd_vdso2c = VDSO2C $@
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
206PHONY += vdso_install $(vdso_img_insttargets) 206PHONY += vdso_install $(vdso_img_insttargets)
207vdso_install: $(vdso_img_insttargets) FORCE 207vdso_install: $(vdso_img_insttargets) FORCE
208 208
209clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* 209clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322751e0..40d2473836c9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
82 cycle_t ret; 82 cycle_t ret;
83 u64 last; 83 u64 last;
84 u32 version; 84 u32 version;
85 u32 migrate_count;
85 u8 flags; 86 u8 flags;
86 unsigned cpu, cpu1; 87 unsigned cpu, cpu1;
87 88
88 89
89 /* 90 /*
90 * Note: hypervisor must guarantee that: 91 * When looping to get a consistent (time-info, tsc) pair, we
91 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 92 * also need to deal with the possibility we can switch vcpus,
92 * 2. that per-CPU pvclock time info is updated if the 93 * so make sure we always re-fetch time-info for the current vcpu.
93 * underlying CPU changes.
94 * 3. that version is increased whenever underlying CPU
95 * changes.
96 *
97 */ 94 */
98 do { 95 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK; 96 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode)
102 * __getcpu() calls (Gleb). 99 * __getcpu() calls (Gleb).
103 */ 100 */
104 101
105 pvti = get_pvti(cpu); 102 /* Make sure migrate_count will change if we leave the VCPU. */
103 do {
104 pvti = get_pvti(cpu);
105 migrate_count = pvti->migrate_count;
106
107 cpu1 = cpu;
108 cpu = __getcpu() & VGETCPU_CPU_MASK;
109 } while (unlikely(cpu != cpu1));
106 110
107 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 111 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
108 112
109 /* 113 /*
110 * Test we're still on the cpu as well as the version. 114 * Test we're still on the cpu as well as the version.
111 * We could have been migrated just after the first 115 * - We must read TSC of pvti's VCPU.
112 * vgetcpu but before fetching the version, so we 116 * - KVM doesn't follow the versioning protocol, so data could
113 * wouldn't notice a version change. 117 * change before version if we left the VCPU.
114 */ 118 */
115 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 119 smp_rmb();
116 } while (unlikely(cpu != cpu1 || 120 } while (unlikely((pvti->pvti.version & 1) ||
117 (pvti->pvti.version & 1) || 121 pvti->pvti.version != version ||
118 pvti->pvti.version != version)); 122 pvti->migrate_count != migrate_count));
119 123
120 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 124 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
121 *mode = VCLOCK_NONE; 125 *mode = VCLOCK_NONE;
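Editor's note: the vread_pvclock() hunk above replaces the single cpu re-check with a two-level retry: an inner loop pins down the current vcpu's time-info (via migrate_count), and the outer loop re-reads until the version is even and unchanged and migrate_count is stable, with smp_rmb() ordering the payload read. A minimal stand-alone sketch of that lockless retry pattern follows; the struct layout, cur_cpu() helper, and tsc_base field are hypothetical stand-ins, not the kernel's pvclock API.

    /* Editor's sketch of the retry pattern above (hypothetical types/helpers). */
    struct time_info {
            unsigned int version;        /* odd while the producer is updating */
            unsigned int migrate_count;  /* bumped when the vcpu migrates      */
            unsigned long long tsc_base; /* the payload we want to read        */
    };

    static unsigned long long read_time(struct time_info *per_cpu_ti,
                                        unsigned int (*cur_cpu)(void))
    {
            struct time_info *ti;
            unsigned int cpu, cpu1, version, migrate;
            unsigned long long val;

            do {
                    /* Inner loop: make sure the record we sample belongs to
                     * the vcpu we are actually running on right now. */
                    do {
                            cpu = cur_cpu();
                            ti = &per_cpu_ti[cpu];
                            migrate = ti->migrate_count;
                            cpu1 = cpu;
                            cpu = cur_cpu();
                    } while (cpu != cpu1);

                    version = ti->version;
                    __sync_synchronize();   /* stands in for smp_rmb() */
                    val = ti->tsc_base;
                    __sync_synchronize();   /* stands in for smp_rmb() */
            } while ((ti->version & 1) ||
                     ti->version != version ||
                     ti->migrate_count != migrate);

            return val;
    }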
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S
index 5415b5613d55..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -19,8 +19,6 @@ __kernel_vsyscall:
19.Lpush_ebp: 19.Lpush_ebp:
20 movl %ecx, %ebp 20 movl %ecx, %ebp
21 syscall 21 syscall
22 movl $__USER32_DS, %ecx
23 movl %ecx, %ss
24 movl %ebp, %ecx 22 movl %ebp, %ecx
25 popl %ebp 23 popl %ebp
26.Lpop_ebp: 24.Lpop_ebp:
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7005ced5d1ad..70e060ad879a 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
7#include <xen/xen.h> 7#include <xen/xen.h>
8#include <xen/interface/physdev.h> 8#include <xen/interface/physdev.h>
9#include "xen-ops.h" 9#include "xen-ops.h"
10#include "smp.h"
10 11
11static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) 12static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
12{ 13{
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
28 return 0xfd; 29 return 0xfd;
29} 30}
30 31
32static unsigned long xen_set_apic_id(unsigned int x)
33{
34 WARN_ON(1);
35 return x;
36}
37
38static unsigned int xen_get_apic_id(unsigned long x)
39{
40 return ((x)>>24) & 0xFFu;
41}
42
43static u32 xen_apic_read(u32 reg)
44{
45 struct xen_platform_op op = {
46 .cmd = XENPF_get_cpuinfo,
47 .interface_version = XENPF_INTERFACE_VERSION,
48 .u.pcpu_info.xen_cpuid = 0,
49 };
50 int ret = 0;
51
52 /* Shouldn't need this as APIC is turned off for PV, and we only
53 * get called on the bootup processor. But just in case. */
54 if (!xen_initial_domain() || smp_processor_id())
55 return 0;
56
57 if (reg == APIC_LVR)
58 return 0x10;
59#ifdef CONFIG_X86_32
60 if (reg == APIC_LDR)
61 return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
62#endif
63 if (reg != APIC_ID)
64 return 0;
65
66 ret = HYPERVISOR_dom0_op(&op);
67 if (ret)
68 return 0;
69
70 return op.u.pcpu_info.apic_id << 24;
71}
72
73static void xen_apic_write(u32 reg, u32 val)
74{
 75 /* Warn to see if there are any stray references */
76 WARN(1,"register: %x, value: %x\n", reg, val);
77}
78
79static u64 xen_apic_icr_read(void)
80{
81 return 0;
82}
83
84static void xen_apic_icr_write(u32 low, u32 id)
85{
 86 /* Warn to see if there are any stray references */
87 WARN_ON(1);
88}
89
90static u32 xen_safe_apic_wait_icr_idle(void)
91{
92 return 0;
93}
94
95static int xen_apic_probe_pv(void)
96{
97 if (xen_pv_domain())
98 return 1;
99
100 return 0;
101}
102
103static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
104{
105 return xen_pv_domain();
106}
107
108static int xen_id_always_valid(int apicid)
109{
110 return 1;
111}
112
113static int xen_id_always_registered(void)
114{
115 return 1;
116}
117
118static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
119{
120 return initial_apic_id >> index_msb;
121}
122
123#ifdef CONFIG_X86_32
124static int xen_x86_32_early_logical_apicid(int cpu)
125{
126 /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
127 return 1 << cpu;
128}
129#endif
130
131static void xen_noop(void)
132{
133}
134
135static void xen_silent_inquire(int apicid)
136{
137}
138
139static struct apic xen_pv_apic = {
140 .name = "Xen PV",
141 .probe = xen_apic_probe_pv,
142 .acpi_madt_oem_check = xen_madt_oem_check,
143 .apic_id_valid = xen_id_always_valid,
144 .apic_id_registered = xen_id_always_registered,
145
146 /* .irq_delivery_mode - used in native_compose_msi_msg only */
147 /* .irq_dest_mode - used in native_compose_msi_msg only */
148
149 .target_cpus = default_target_cpus,
150 .disable_esr = 0,
151 /* .dest_logical - default_send_IPI_ use it but we use our own. */
152 .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
153
154 .vector_allocation_domain = flat_vector_allocation_domain,
155 .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
156
157 .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
158 .setup_apic_routing = NULL,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
161 .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
162 .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
163
164 .get_apic_id = xen_get_apic_id,
165 .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
166 .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
167
168 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
169
170#ifdef CONFIG_SMP
171 .send_IPI_mask = xen_send_IPI_mask,
172 .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
173 .send_IPI_allbutself = xen_send_IPI_allbutself,
174 .send_IPI_all = xen_send_IPI_all,
175 .send_IPI_self = xen_send_IPI_self,
176#endif
177 /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
178 .inquire_remote_apic = xen_silent_inquire,
179
180 .read = xen_apic_read,
181 .write = xen_apic_write,
182 .eoi_write = xen_apic_write,
183
184 .icr_read = xen_apic_icr_read,
185 .icr_write = xen_apic_icr_write,
186 .wait_icr_idle = xen_noop,
187 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
188
189#ifdef CONFIG_X86_32
190 /* generic_processor_info and setup_local_APIC. */
191 .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid,
192#endif
193};
194
195static void __init xen_apic_check(void)
196{
197 if (apic == &xen_pv_apic)
198 return;
199
200 pr_info("Switched APIC routing from %s to %s.\n", apic->name,
201 xen_pv_apic.name);
202 apic = &xen_pv_apic;
203}
31void __init xen_init_apic(void) 204void __init xen_init_apic(void)
32{ 205{
33 x86_io_apic_ops.read = xen_io_apic_read; 206 x86_io_apic_ops.read = xen_io_apic_read;
207 /* On PV guests the APIC CPUID bit is disabled so none of the
208 * routines end up executing. */
209 if (!xen_initial_domain())
210 apic = &xen_pv_apic;
211
212 x86_platform.apic_post_init = xen_apic_check;
34} 213}
214apic_driver(xen_pv_apic);
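Editor's note: in the xen_pv_apic driver added above, xen_apic_read() packs the physical APIC ID into bits 31:24 of the emulated APIC_ID register (apic_id << 24), xen_get_apic_id() unpacks the same field, and .apic_id_mask is 0xFF << 24 to match. A tiny hedged sketch of that round trip, using hypothetical stand-alone helper names:

    /* Editor's sketch: the shift/mask on both sides must agree, since the
     * APIC ID lives in bits 31:24 of the APIC_ID register. Helper names
     * are hypothetical. */
    #include <assert.h>

    static unsigned long pack_apic_id(unsigned int phys_id)
    {
            return ((unsigned long)phys_id & 0xFFu) << 24;  /* mirrors apic_id << 24 */
    }

    static unsigned int unpack_apic_id(unsigned long reg)
    {
            return (reg >> 24) & 0xFFu;                     /* mirrors xen_get_apic_id() */
    }

    int main(void)
    {
            assert(unpack_apic_id(pack_apic_id(5)) == 5);
            return 0;
    }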
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..94578efd3067 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
912 mcs = xen_mc_entry(0); 912 mcs = xen_mc_entry(0);
913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
914 xen_mc_issue(PARAVIRT_LAZY_CPU); 914 xen_mc_issue(PARAVIRT_LAZY_CPU);
915 tss->x86_tss.sp0 = thread->sp0;
915} 916}
916 917
917static void xen_set_iopl_mask(unsigned mask) 918static void xen_set_iopl_mask(unsigned mask)
@@ -927,92 +928,6 @@ static void xen_io_delay(void)
927{ 928{
928} 929}
929 930
930#ifdef CONFIG_X86_LOCAL_APIC
931static unsigned long xen_set_apic_id(unsigned int x)
932{
933 WARN_ON(1);
934 return x;
935}
936static unsigned int xen_get_apic_id(unsigned long x)
937{
938 return ((x)>>24) & 0xFFu;
939}
940static u32 xen_apic_read(u32 reg)
941{
942 struct xen_platform_op op = {
943 .cmd = XENPF_get_cpuinfo,
944 .interface_version = XENPF_INTERFACE_VERSION,
945 .u.pcpu_info.xen_cpuid = 0,
946 };
947 int ret = 0;
948
949 /* Shouldn't need this as APIC is turned off for PV, and we only
950 * get called on the bootup processor. But just in case. */
951 if (!xen_initial_domain() || smp_processor_id())
952 return 0;
953
954 if (reg == APIC_LVR)
955 return 0x10;
956
957 if (reg != APIC_ID)
958 return 0;
959
960 ret = HYPERVISOR_dom0_op(&op);
961 if (ret)
962 return 0;
963
964 return op.u.pcpu_info.apic_id << 24;
965}
966
967static void xen_apic_write(u32 reg, u32 val)
968{
969 /* Warn to see if there's any stray references */
970 WARN_ON(1);
971}
972
973static u64 xen_apic_icr_read(void)
974{
975 return 0;
976}
977
978static void xen_apic_icr_write(u32 low, u32 id)
979{
980 /* Warn to see if there's any stray references */
981 WARN_ON(1);
982}
983
984static void xen_apic_wait_icr_idle(void)
985{
986 return;
987}
988
989static u32 xen_safe_apic_wait_icr_idle(void)
990{
991 return 0;
992}
993
994static void set_xen_basic_apic_ops(void)
995{
996 apic->read = xen_apic_read;
997 apic->write = xen_apic_write;
998 apic->icr_read = xen_apic_icr_read;
999 apic->icr_write = xen_apic_icr_write;
1000 apic->wait_icr_idle = xen_apic_wait_icr_idle;
1001 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
1002 apic->set_apic_id = xen_set_apic_id;
1003 apic->get_apic_id = xen_get_apic_id;
1004
1005#ifdef CONFIG_SMP
1006 apic->send_IPI_allbutself = xen_send_IPI_allbutself;
1007 apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
1008 apic->send_IPI_mask = xen_send_IPI_mask;
1009 apic->send_IPI_all = xen_send_IPI_all;
1010 apic->send_IPI_self = xen_send_IPI_self;
1011#endif
1012}
1013
1014#endif
1015
1016static void xen_clts(void) 931static void xen_clts(void)
1017{ 932{
1018 struct multicall_space mcs; 933 struct multicall_space mcs;
@@ -1618,7 +1533,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
1618 /* 1533 /*
1619 * set up the basic apic ops. 1534 * set up the basic apic ops.
1620 */ 1535 */
1621 set_xen_basic_apic_ops(); 1536 xen_init_apic();
1622#endif 1537#endif
1623 1538
1624 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1539 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1731,8 +1646,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
1731 if (HYPERVISOR_dom0_op(&op) == 0) 1646 if (HYPERVISOR_dom0_op(&op) == 0)
1732 boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; 1647 boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1733 1648
1734 xen_init_apic();
1735
1736 /* Make sure ACS will be enabled */ 1649 /* Make sure ACS will be enabled */
1737 pci_request_acs(); 1650 pci_request_acs();
1738 1651
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..dd151b2045b0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
502} 502}
503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
504 504
505#if PAGETABLE_LEVELS == 4 505#if CONFIG_PGTABLE_LEVELS == 4
506__visible pudval_t xen_pud_val(pud_t pud) 506__visible pudval_t xen_pud_val(pud_t pud)
507{ 507{
508 return pte_mfn_to_pfn(pud.pud); 508 return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
589 589
590 xen_mc_issue(PARAVIRT_LAZY_MMU); 590 xen_mc_issue(PARAVIRT_LAZY_MMU);
591} 591}
592#endif /* PAGETABLE_LEVELS == 4 */ 592#endif /* CONFIG_PGTABLE_LEVELS == 4 */
593 593
594/* 594/*
595 * (Yet another) pagetable walker. This one is intended for pinning a 595 * (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
1628 xen_release_ptpage(pfn, PT_PMD); 1628 xen_release_ptpage(pfn, PT_PMD);
1629} 1629}
1630 1630
1631#if PAGETABLE_LEVELS == 4 1631#if CONFIG_PGTABLE_LEVELS == 4
1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1633{ 1633{
1634 xen_alloc_ptpage(mm, pfn, PT_PUD); 1634 xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
2046 pv_mmu_ops.set_pte = xen_set_pte; 2046 pv_mmu_ops.set_pte = xen_set_pte;
2047 pv_mmu_ops.set_pmd = xen_set_pmd; 2047 pv_mmu_ops.set_pmd = xen_set_pmd;
2048 pv_mmu_ops.set_pud = xen_set_pud; 2048 pv_mmu_ops.set_pud = xen_set_pud;
2049#if PAGETABLE_LEVELS == 4 2049#if CONFIG_PGTABLE_LEVELS == 4
2050 pv_mmu_ops.set_pgd = xen_set_pgd; 2050 pv_mmu_ops.set_pgd = xen_set_pgd;
2051#endif 2051#endif
2052 2052
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2057 pv_mmu_ops.release_pte = xen_release_pte; 2057 pv_mmu_ops.release_pte = xen_release_pte;
2058 pv_mmu_ops.release_pmd = xen_release_pmd; 2058 pv_mmu_ops.release_pmd = xen_release_pmd;
2059#if PAGETABLE_LEVELS == 4 2059#if CONFIG_PGTABLE_LEVELS == 4
2060 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2060 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2061 pv_mmu_ops.release_pud = xen_release_pud; 2061 pv_mmu_ops.release_pud = xen_release_pud;
2062#endif 2062#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2124 2124
2125#if PAGETABLE_LEVELS == 4 2125#if CONFIG_PGTABLE_LEVELS == 4
2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2128 .set_pgd = xen_set_pgd_hyper, 2128 .set_pgd = xen_set_pgd_hyper,
2129 2129
2130 .alloc_pud = xen_alloc_pmd_init, 2130 .alloc_pud = xen_alloc_pmd_init,
2131 .release_pud = xen_release_pmd_init, 2131 .release_pud = xen_release_pmd_init,
2132#endif /* PAGETABLE_LEVELS == 4 */ 2132#endif /* CONFIG_PGTABLE_LEVELS == 4 */
2133 2133
2134 .activate_mm = xen_activate_mm, 2134 .activate_mm = xen_activate_mm,
2135 .dup_mmap = xen_dup_mmap, 2135 .dup_mmap = xen_dup_mmap,
@@ -2436,99 +2436,11 @@ void __init xen_hvm_init_mmu_ops(void)
2436} 2436}
2437#endif 2437#endif
2438 2438
2439#ifdef CONFIG_XEN_PVH
2440/*
2441 * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
2442 * space creating new guest on pvh dom0 and needing to map domU pages.
2443 */
2444static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
2445 unsigned int domid)
2446{
2447 int rc, err = 0;
2448 xen_pfn_t gpfn = lpfn;
2449 xen_ulong_t idx = fgfn;
2450
2451 struct xen_add_to_physmap_range xatp = {
2452 .domid = DOMID_SELF,
2453 .foreign_domid = domid,
2454 .size = 1,
2455 .space = XENMAPSPACE_gmfn_foreign,
2456 };
2457 set_xen_guest_handle(xatp.idxs, &idx);
2458 set_xen_guest_handle(xatp.gpfns, &gpfn);
2459 set_xen_guest_handle(xatp.errs, &err);
2460
2461 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
2462 if (rc < 0)
2463 return rc;
2464 return err;
2465}
2466
2467static int xlate_remove_from_p2m(unsigned long spfn, int count)
2468{
2469 struct xen_remove_from_physmap xrp;
2470 int i, rc;
2471
2472 for (i = 0; i < count; i++) {
2473 xrp.domid = DOMID_SELF;
2474 xrp.gpfn = spfn+i;
2475 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
2476 if (rc)
2477 break;
2478 }
2479 return rc;
2480}
2481
2482struct xlate_remap_data {
2483 unsigned long fgfn; /* foreign domain's gfn */
2484 pgprot_t prot;
2485 domid_t domid;
2486 int index;
2487 struct page **pages;
2488};
2489
2490static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
2491 void *data)
2492{
2493 int rc;
2494 struct xlate_remap_data *remap = data;
2495 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
2496 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
2497
2498 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
2499 if (rc)
2500 return rc;
2501 native_set_pte(ptep, pteval);
2502
2503 return 0;
2504}
2505
2506static int xlate_remap_gfn_range(struct vm_area_struct *vma,
2507 unsigned long addr, unsigned long mfn,
2508 int nr, pgprot_t prot, unsigned domid,
2509 struct page **pages)
2510{
2511 int err;
2512 struct xlate_remap_data pvhdata;
2513
2514 BUG_ON(!pages);
2515
2516 pvhdata.fgfn = mfn;
2517 pvhdata.prot = prot;
2518 pvhdata.domid = domid;
2519 pvhdata.index = 0;
2520 pvhdata.pages = pages;
2521 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
2522 xlate_map_pte_fn, &pvhdata);
2523 flush_tlb_all();
2524 return err;
2525}
2526#endif
2527
2528#define REMAP_BATCH_SIZE 16 2439#define REMAP_BATCH_SIZE 16
2529 2440
2530struct remap_data { 2441struct remap_data {
2531 unsigned long mfn; 2442 xen_pfn_t *mfn;
2443 bool contiguous;
2532 pgprot_t prot; 2444 pgprot_t prot;
2533 struct mmu_update *mmu_update; 2445 struct mmu_update *mmu_update;
2534}; 2446};
@@ -2537,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2537 unsigned long addr, void *data) 2449 unsigned long addr, void *data)
2538{ 2450{
2539 struct remap_data *rmd = data; 2451 struct remap_data *rmd = data;
2540 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); 2452 pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
2453
 2454 /* If we have a contiguous range, just update the mfn itself,
 2455 else advance the pointer to the next mfn. */
2456 if (rmd->contiguous)
2457 (*rmd->mfn)++;
2458 else
2459 rmd->mfn++;
2541 2460
2542 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; 2461 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2543 rmd->mmu_update->val = pte_val_ma(pte); 2462 rmd->mmu_update->val = pte_val_ma(pte);
@@ -2546,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2546 return 0; 2465 return 0;
2547} 2466}
2548 2467
2549int xen_remap_domain_mfn_range(struct vm_area_struct *vma, 2468static int do_remap_mfn(struct vm_area_struct *vma,
2550 unsigned long addr, 2469 unsigned long addr,
2551 xen_pfn_t mfn, int nr, 2470 xen_pfn_t *mfn, int nr,
2552 pgprot_t prot, unsigned domid, 2471 int *err_ptr, pgprot_t prot,
2553 struct page **pages) 2472 unsigned domid,
2554 2473 struct page **pages)
2555{ 2474{
2475 int err = 0;
2556 struct remap_data rmd; 2476 struct remap_data rmd;
2557 struct mmu_update mmu_update[REMAP_BATCH_SIZE]; 2477 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2558 int batch;
2559 unsigned long range; 2478 unsigned long range;
2560 int err = 0; 2479 int mapped = 0;
2561 2480
2562 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2481 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2563 2482
2564 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2483 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2565#ifdef CONFIG_XEN_PVH 2484#ifdef CONFIG_XEN_PVH
2566 /* We need to update the local page tables and the xen HAP */ 2485 /* We need to update the local page tables and the xen HAP */
2567 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, 2486 return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
2568 domid, pages); 2487 prot, domid, pages);
2569#else 2488#else
2570 return -EINVAL; 2489 return -EINVAL;
2571#endif 2490#endif
@@ -2573,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2573 2492
2574 rmd.mfn = mfn; 2493 rmd.mfn = mfn;
2575 rmd.prot = prot; 2494 rmd.prot = prot;
 2495 /* We use the err_ptr to indicate whether we are doing a contiguous
 2496 * mapping or a discontiguous mapping. */
2497 rmd.contiguous = !err_ptr;
2576 2498
2577 while (nr) { 2499 while (nr) {
2578 batch = min(REMAP_BATCH_SIZE, nr); 2500 int index = 0;
2501 int done = 0;
2502 int batch = min(REMAP_BATCH_SIZE, nr);
2503 int batch_left = batch;
2579 range = (unsigned long)batch << PAGE_SHIFT; 2504 range = (unsigned long)batch << PAGE_SHIFT;
2580 2505
2581 rmd.mmu_update = mmu_update; 2506 rmd.mmu_update = mmu_update;
@@ -2584,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2584 if (err) 2509 if (err)
2585 goto out; 2510 goto out;
2586 2511
 2587 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); 2512 /* We record the error for each page that fails, but
 2588 if (err < 0) 2513 * continue mapping until the whole set is done. */
2589 goto out; 2514 do {
2515 int i;
2516
2517 err = HYPERVISOR_mmu_update(&mmu_update[index],
2518 batch_left, &done, domid);
2519
2520 /*
2521 * @err_ptr may be the same buffer as @mfn, so
2522 * only clear it after each chunk of @mfn is
2523 * used.
2524 */
2525 if (err_ptr) {
2526 for (i = index; i < index + done; i++)
2527 err_ptr[i] = 0;
2528 }
2529 if (err < 0) {
2530 if (!err_ptr)
2531 goto out;
2532 err_ptr[i] = err;
2533 done++; /* Skip failed frame. */
2534 } else
2535 mapped += done;
2536 batch_left -= done;
2537 index += done;
2538 } while (batch_left);
2590 2539
2591 nr -= batch; 2540 nr -= batch;
2592 addr += range; 2541 addr += range;
2542 if (err_ptr)
2543 err_ptr += batch;
2593 } 2544 }
2594
2595 err = 0;
2596out: 2545out:
2597 2546
2598 xen_flush_tlb_all(); 2547 xen_flush_tlb_all();
2599 2548
2600 return err; 2549 return err < 0 ? err : mapped;
2550}
2551
2552int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2553 unsigned long addr,
2554 xen_pfn_t mfn, int nr,
2555 pgprot_t prot, unsigned domid,
2556 struct page **pages)
2557{
2558 return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
2601} 2559}
2602EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2560EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2603 2561
2562int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
2563 unsigned long addr,
2564 xen_pfn_t *mfn, int nr,
2565 int *err_ptr, pgprot_t prot,
2566 unsigned domid, struct page **pages)
2567{
 2568 /* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
 2569 * and it would otherwise be very hard to track down why the
 2570 * wrong memory was mapped in.
2571 */
2572 BUG_ON(err_ptr == NULL);
2573 return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
2574}
2575EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
2576
2577
2604/* Returns: 0 success */ 2578/* Returns: 0 success */
2605int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, 2579int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2606 int numpgs, struct page **pages) 2580 int numpgs, struct page **pages)
@@ -2609,22 +2583,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2609 return 0; 2583 return 0;
2610 2584
2611#ifdef CONFIG_XEN_PVH 2585#ifdef CONFIG_XEN_PVH
2612 while (numpgs--) { 2586 return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
2613 /*
2614 * The mmu has already cleaned up the process mmu
2615 * resources at this point (lookup_address will return
2616 * NULL).
2617 */
2618 unsigned long pfn = page_to_pfn(pages[numpgs]);
2619
2620 xlate_remove_from_p2m(pfn, 1);
2621 }
2622 /*
2623 * We don't need to flush tlbs because as part of
2624 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
2625 * after removing the p2m entries from the EPT/NPT
2626 */
2627 return 0;
2628#else 2587#else
2629 return -EINVAL; 2588 return -EINVAL;
2630#endif 2589#endif
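Editor's note: the new do_remap_mfn() above walks the range in REMAP_BATCH_SIZE chunks and, when the caller supplies err_ptr, records a per-frame error and keeps mapping instead of aborting, returning the number of frames mapped. A self-contained sketch of that "batch with partial completion" loop follows; process_batch() and its contract (consumes entries until one fails, reports the count via *done, returns a negative error) are assumptions standing in for HYPERVISOR_mmu_update().

    /* Editor's sketch of the per-frame error handling pattern above. */
    static int remap_batch(int *entries, int batch, int *err_ptr,
                           int (*process_batch)(int *ents, int n, int *done))
    {
            int index = 0, batch_left = batch, mapped = 0;

            do {
                    int i, done = 0;
                    int err = process_batch(&entries[index], batch_left, &done);

                    /* Entries that were consumed succeeded; clear their slots. */
                    if (err_ptr)
                            for (i = index; i < index + done; i++)
                                    err_ptr[i] = 0;

                    if (err < 0) {
                            if (!err_ptr)
                                    return err;     /* no per-frame reporting requested */
                            err_ptr[index + done] = err;
                            done++;                 /* skip the failed frame */
                    } else {
                            mapped += done;
                    }
                    batch_left -= done;
                    index += done;
            } while (batch_left);

            return mapped;
    }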
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9f93af56a5fc..b47124d4cd67 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -91,6 +91,12 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
91unsigned long xen_max_p2m_pfn __read_mostly; 91unsigned long xen_max_p2m_pfn __read_mostly;
92EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); 92EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
93 93
94#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
95#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
96#else
97#define P2M_LIMIT 0
98#endif
99
94static DEFINE_SPINLOCK(p2m_update_lock); 100static DEFINE_SPINLOCK(p2m_update_lock);
95 101
96static unsigned long *p2m_mid_missing_mfn; 102static unsigned long *p2m_mid_missing_mfn;
@@ -385,9 +391,11 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m)
385void __init xen_vmalloc_p2m_tree(void) 391void __init xen_vmalloc_p2m_tree(void)
386{ 392{
387 static struct vm_struct vm; 393 static struct vm_struct vm;
394 unsigned long p2m_limit;
388 395
396 p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
389 vm.flags = VM_ALLOC; 397 vm.flags = VM_ALLOC;
390 vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn, 398 vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
391 PMD_SIZE * PMDS_PER_MID_PAGE); 399 PMD_SIZE * PMDS_PER_MID_PAGE);
392 vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE); 400 vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
393 pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size); 401 pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
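Editor's note: xen_vmalloc_p2m_tree() above now sizes the p2m area for the configured balloon hotplug limit, converting gibibytes into page-frame numbers. As a worked example under assumed values (a hypothetical 512 GiB limit, 4 KiB pages, 8-byte p2m entries):

    /* Editor's sketch of the size computation; the 512 GiB limit is assumed. */
    #define GIB       (1024UL * 1024 * 1024)
    #define PAGE_SZ   4096UL

    static const unsigned long p2m_limit_pfns = 512UL * GIB / PAGE_SZ;   /* 134217728 pfns */
    static const unsigned long p2m_bytes =
            512UL * GIB / PAGE_SZ * sizeof(unsigned long);               /* 1 GiB of entries */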
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..86484384492e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
90 90
91 set_cpu_online(cpu, true); 91 set_cpu_online(cpu, true);
92 92
93 this_cpu_write(cpu_state, CPU_ONLINE); 93 cpu_set_state_online(cpu); /* Implies full memory barrier. */
94
95 wmb();
96 94
97 /* We can take interrupts now: we're officially "up". */ 95 /* We can take interrupts now: we're officially "up". */
98 local_irq_enable(); 96 local_irq_enable();
99
100 wmb(); /* make sure everything is out */
101} 97}
102 98
103/* 99/*
@@ -445,21 +441,19 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
445{ 441{
446 int rc; 442 int rc;
447 443
448 per_cpu(current_task, cpu) = idle; 444 common_cpu_up(cpu, idle);
449#ifdef CONFIG_X86_32
450 irq_ctx_init(cpu);
451#else
452 clear_tsk_thread_flag(idle, TIF_FORK);
453#endif
454 per_cpu(kernel_stack, cpu) =
455 (unsigned long)task_stack_page(idle) -
456 KERNEL_STACK_OFFSET + THREAD_SIZE;
457 445
458 xen_setup_runstate_info(cpu); 446 xen_setup_runstate_info(cpu);
459 xen_setup_timer(cpu); 447 xen_setup_timer(cpu);
460 xen_init_lock_cpu(cpu); 448 xen_init_lock_cpu(cpu);
461 449
462 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 450 /*
451 * PV VCPUs are always successfully taken down (see 'while' loop
452 * in xen_cpu_die()), so -EBUSY is an error.
453 */
454 rc = cpu_check_up_prepare(cpu);
455 if (rc)
456 return rc;
463 457
464 /* make sure interrupts start blocked */ 458 /* make sure interrupts start blocked */
465 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 459 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -468,10 +462,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
468 if (rc) 462 if (rc)
469 return rc; 463 return rc;
470 464
471 if (num_online_cpus() == 1)
472 /* Just in case we booted with a single CPU. */
473 alternatives_enable_smp();
474
475 rc = xen_smp_intr_init(cpu); 465 rc = xen_smp_intr_init(cpu);
476 if (rc) 466 if (rc)
477 return rc; 467 return rc;
@@ -479,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
479 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 469 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
480 BUG_ON(rc); 470 BUG_ON(rc);
481 471
482 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { 472 while (cpu_report_state(cpu) != CPU_ONLINE)
483 HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 473 HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
484 barrier();
485 }
486 474
487 return 0; 475 return 0;
488} 476}
@@ -511,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu)
511 schedule_timeout(HZ/10); 499 schedule_timeout(HZ/10);
512 } 500 }
513 501
514 cpu_die_common(cpu); 502 if (common_cpu_die(cpu) == 0) {
515 503 xen_smp_intr_free(cpu);
516 xen_smp_intr_free(cpu); 504 xen_uninit_lock_cpu(cpu);
517 xen_uninit_lock_cpu(cpu); 505 xen_teardown_timer(cpu);
518 xen_teardown_timer(cpu); 506 }
519} 507}
520 508
521static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ 509static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
747static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 735static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
748{ 736{
749 int rc; 737 int rc;
738
739 /*
 740 * This can happen if the CPU was offlined earlier and
741 * offlining timed out in common_cpu_die().
742 */
743 if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
744 xen_smp_intr_free(cpu);
745 xen_uninit_lock_cpu(cpu);
746 }
747
750 /* 748 /*
751 * xen_smp_intr_init() needs to run before native_cpu_up() 749 * xen_smp_intr_init() needs to run before native_cpu_up()
752 * so that IPI vectors are set up on the booting CPU before 750 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
768 return rc; 766 return rc;
769} 767}
770 768
771static void xen_hvm_cpu_die(unsigned int cpu)
772{
773 xen_cpu_die(cpu);
774 native_cpu_die(cpu);
775}
776
777void __init xen_hvm_smp_init(void) 769void __init xen_hvm_smp_init(void)
778{ 770{
779 if (!xen_have_vector_callback) 771 if (!xen_have_vector_callback)
@@ -781,7 +773,7 @@ void __init xen_hvm_smp_init(void)
781 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 773 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
782 smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 774 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
783 smp_ops.cpu_up = xen_hvm_cpu_up; 775 smp_ops.cpu_up = xen_hvm_cpu_up;
784 smp_ops.cpu_die = xen_hvm_cpu_die; 776 smp_ops.cpu_die = xen_cpu_die;
785 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 777 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
786 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 778 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
787 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; 779 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
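Editor's note: xen_cpu_up()/cpu_bringup() above now go through the common hotplug state machine (cpu_check_up_prepare(), cpu_set_state_online(), cpu_report_state()), so the boot CPU simply yields to the hypervisor until the new vcpu reports itself online. A hedged sketch of that hand-off with C11 atomics; the state values and yield() hook are illustrative, not the kernel's API.

    /* Editor's sketch of the bring-up hand-off (hypothetical states/hooks). */
    #include <stdatomic.h>

    enum { CPU_STATE_UP_PREPARE, CPU_STATE_ONLINE };

    static _Atomic int cpu_state[64];

    static void ap_entry(int cpu)            /* runs on the new vcpu */
    {
            /* ... per-CPU setup would go here ... */
            atomic_store_explicit(&cpu_state[cpu], CPU_STATE_ONLINE,
                                  memory_order_release);  /* like cpu_set_state_online() */
    }

    static void wait_for_online(int cpu, void (*yield)(void))  /* runs on the boot CPU */
    {
            while (atomic_load_explicit(&cpu_state[cpu],
                                        memory_order_acquire) != CPU_STATE_ONLINE)
                    yield();  /* stands in for HYPERVISOR_sched_op(SCHEDOP_yield, NULL) */
    }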
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index c4df9dbd63b7..d9497698645a 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,5 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h> 2#include <linux/tick.h>
3 3
4#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
5#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled)
81 81
82static void xen_vcpu_notify_restore(void *data) 82static void xen_vcpu_notify_restore(void *data)
83{ 83{
84 unsigned long reason = (unsigned long)data;
85
86 /* Boot processor notified via generic timekeeping_resume() */ 84 /* Boot processor notified via generic timekeeping_resume() */
87 if ( smp_processor_id() == 0) 85 if (smp_processor_id() == 0)
88 return; 86 return;
89 87
90 clockevents_notify(reason, NULL); 88 tick_resume_local();
91} 89}
92 90
93void xen_arch_resume(void) 91void xen_arch_resume(void)
94{ 92{
95 on_each_cpu(xen_vcpu_notify_restore, 93 on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
96 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
97} 94}
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 520022d1a181..a702ec2f5931 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -1,54 +1,12 @@
1#include <linux/ftrace.h> 1#include <linux/ftrace.h>
2#include <xen/interface/xen.h> 2#include <xen/interface/xen.h>
3#include <xen/interface/xen-mca.h>
3 4
4#define N(x) [__HYPERVISOR_##x] = "("#x")" 5#define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")",
5static const char *xen_hypercall_names[] = { 6static const char *xen_hypercall_names[] = {
6 N(set_trap_table), 7#include <asm/xen-hypercalls.h>
7 N(mmu_update),
8 N(set_gdt),
9 N(stack_switch),
10 N(set_callbacks),
11 N(fpu_taskswitch),
12 N(sched_op_compat),
13 N(dom0_op),
14 N(set_debugreg),
15 N(get_debugreg),
16 N(update_descriptor),
17 N(memory_op),
18 N(multicall),
19 N(update_va_mapping),
20 N(set_timer_op),
21 N(event_channel_op_compat),
22 N(xen_version),
23 N(console_io),
24 N(physdev_op_compat),
25 N(grant_table_op),
26 N(vm_assist),
27 N(update_va_mapping_otherdomain),
28 N(iret),
29 N(vcpu_op),
30 N(set_segment_base),
31 N(mmuext_op),
32 N(acm_op),
33 N(nmi_op),
34 N(sched_op),
35 N(callback_op),
36 N(xenoprof_op),
37 N(event_channel_op),
38 N(physdev_op),
39 N(hvm_op),
40
41/* Architecture-specific hypercall definitions. */
42 N(arch_0),
43 N(arch_1),
44 N(arch_2),
45 N(arch_3),
46 N(arch_4),
47 N(arch_5),
48 N(arch_6),
49 N(arch_7),
50}; 8};
51#undef N 9#undef HYPERCALL
52 10
53static const char *xen_hypercall_name(unsigned op) 11static const char *xen_hypercall_name(unsigned op)
54{ 12{
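Editor's note: the trace.c hunk above (and the xen-head.S hunk further down) both derive their hypercall tables from a single list by redefining a HYPERCALL() macro before including asm/xen-hypercalls.h, the classic "X macro" technique, so the name table and the hypercall page stubs cannot drift apart. A generic hedged illustration, with the list inlined here for self-containment and only a few illustrative entries:

    /* Editor's sketch of the X-macro technique; the entry list is illustrative. */
    #define HYPERCALL_LIST        \
            HYPERCALL(mmu_update) \
            HYPERCALL(memory_op)  \
            HYPERCALL(sched_op)

    /* Expansion 1: symbolic indices. */
    enum {
    #define HYPERCALL(x) __HC_##x,
            HYPERCALL_LIST
    #undef HYPERCALL
            __HC_MAX
    };

    /* Expansion 2: a name table indexed by the same constants. */
    static const char *hypercall_names[] = {
    #define HYPERCALL(x) [__HC_##x] = "(" #x ")",
            HYPERCALL_LIST
    #undef HYPERCALL
    };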
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
68 * We're already on the usermode stack at this point, but 68 * We're already on the usermode stack at this point, but
69 * still with the kernel gs, so we can easily switch back 69 * still with the kernel gs, so we can easily switch back
70 */ 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp) 71 movq %rsp, PER_CPU_VAR(rsp_scratch)
72 movq PER_CPU_VAR(kernel_stack), %rsp 72 movq PER_CPU_VAR(kernel_stack), %rsp
73 73
74 pushq $__USER_DS 74 pushq $__USER_DS
75 pushq PER_CPU_VAR(old_rsp) 75 pushq PER_CPU_VAR(rsp_scratch)
76 pushq %r11 76 pushq %r11
77 pushq $__USER_CS 77 pushq $__USER_CS
78 pushq %rcx 78 pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
87 * We're already on the usermode stack at this point, but 87 * We're already on the usermode stack at this point, but
88 * still with the kernel gs, so we can easily switch back 88 * still with the kernel gs, so we can easily switch back
89 */ 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp) 90 movq %rsp, PER_CPU_VAR(rsp_scratch)
91 movq PER_CPU_VAR(kernel_stack), %rsp 91 movq PER_CPU_VAR(kernel_stack), %rsp
92 92
93 pushq $__USER32_DS 93 pushq $__USER32_DS
94 pushq PER_CPU_VAR(old_rsp) 94 pushq PER_CPU_VAR(rsp_scratch)
95 pushq %r11 95 pushq %r11
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 674b222544b7..8afdfccf6086 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -12,6 +12,8 @@
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <xen/interface/features.h> 14#include <xen/interface/features.h>
15#include <xen/interface/xen.h>
16#include <xen/interface/xen-mca.h>
15#include <asm/xen/interface.h> 17#include <asm/xen/interface.h>
16 18
17#ifdef CONFIG_XEN_PVH 19#ifdef CONFIG_XEN_PVH
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init)
85.pushsection .text 87.pushsection .text
86 .balign PAGE_SIZE 88 .balign PAGE_SIZE
87ENTRY(hypercall_page) 89ENTRY(hypercall_page)
88#define NEXT_HYPERCALL(x) \ 90 .skip PAGE_SIZE
89 ENTRY(xen_hypercall_##x) \ 91
90 .skip 32 92#define HYPERCALL(n) \
91 93 .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
92NEXT_HYPERCALL(set_trap_table) 94 .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
93NEXT_HYPERCALL(mmu_update) 95#include <asm/xen-hypercalls.h>
94NEXT_HYPERCALL(set_gdt) 96#undef HYPERCALL
95NEXT_HYPERCALL(stack_switch) 97
96NEXT_HYPERCALL(set_callbacks)
97NEXT_HYPERCALL(fpu_taskswitch)
98NEXT_HYPERCALL(sched_op_compat)
99NEXT_HYPERCALL(platform_op)
100NEXT_HYPERCALL(set_debugreg)
101NEXT_HYPERCALL(get_debugreg)
102NEXT_HYPERCALL(update_descriptor)
103NEXT_HYPERCALL(ni)
104NEXT_HYPERCALL(memory_op)
105NEXT_HYPERCALL(multicall)
106NEXT_HYPERCALL(update_va_mapping)
107NEXT_HYPERCALL(set_timer_op)
108NEXT_HYPERCALL(event_channel_op_compat)
109NEXT_HYPERCALL(xen_version)
110NEXT_HYPERCALL(console_io)
111NEXT_HYPERCALL(physdev_op_compat)
112NEXT_HYPERCALL(grant_table_op)
113NEXT_HYPERCALL(vm_assist)
114NEXT_HYPERCALL(update_va_mapping_otherdomain)
115NEXT_HYPERCALL(iret)
116NEXT_HYPERCALL(vcpu_op)
117NEXT_HYPERCALL(set_segment_base)
118NEXT_HYPERCALL(mmuext_op)
119NEXT_HYPERCALL(xsm_op)
120NEXT_HYPERCALL(nmi_op)
121NEXT_HYPERCALL(sched_op)
122NEXT_HYPERCALL(callback_op)
123NEXT_HYPERCALL(xenoprof_op)
124NEXT_HYPERCALL(event_channel_op)
125NEXT_HYPERCALL(physdev_op)
126NEXT_HYPERCALL(hvm_op)
127NEXT_HYPERCALL(sysctl)
128NEXT_HYPERCALL(domctl)
129NEXT_HYPERCALL(kexec_op)
130NEXT_HYPERCALL(tmem_op) /* 38 */
131ENTRY(xen_hypercall_rsvr)
132 .skip 320
133NEXT_HYPERCALL(mca) /* 48 */
134NEXT_HYPERCALL(arch_1)
135NEXT_HYPERCALL(arch_2)
136NEXT_HYPERCALL(arch_3)
137NEXT_HYPERCALL(arch_4)
138NEXT_HYPERCALL(arch_5)
139NEXT_HYPERCALL(arch_6)
140 .balign PAGE_SIZE
141.popsection 98.popsection
142 99
143 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 100 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")