path: root/arch/x86
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig | 129
-rw-r--r-- arch/x86/Makefile | 8
-rw-r--r-- arch/x86/boot/Makefile | 2
-rw-r--r-- arch/x86/boot/bioscall.S | 6
-rw-r--r-- arch/x86/boot/boot.h | 10
-rw-r--r-- arch/x86/boot/compressed/Makefile | 2
-rw-r--r-- arch/x86/boot/compressed/aslr.c | 316
-rw-r--r-- arch/x86/boot/compressed/cmdline.c | 2
-rw-r--r-- arch/x86/boot/compressed/cpuflags.c | 12
-rw-r--r-- arch/x86/boot/compressed/head_32.S | 10
-rw-r--r-- arch/x86/boot/compressed/head_64.S | 16
-rw-r--r-- arch/x86/boot/compressed/misc.c | 18
-rw-r--r-- arch/x86/boot/compressed/misc.h | 37
-rw-r--r-- arch/x86/boot/copy.S | 22
-rw-r--r-- arch/x86/boot/cpucheck.c | 100
-rw-r--r-- arch/x86/boot/cpuflags.c | 104
-rw-r--r-- arch/x86/boot/cpuflags.h | 19
-rw-r--r-- arch/x86/boot/header.S | 9
-rw-r--r-- arch/x86/crypto/Makefile | 1
-rw-r--r-- arch/x86/crypto/aesni-intel_avx-x86_64.S | 2811
-rw-r--r-- arch/x86/crypto/aesni-intel_glue.c | 147
-rw-r--r-- arch/x86/include/asm/archrandom.h | 21
-rw-r--r-- arch/x86/include/asm/barrier.h | 43
-rw-r--r-- arch/x86/include/asm/cpufeature.h | 1
-rw-r--r-- arch/x86/include/asm/dmi.h | 6
-rw-r--r-- arch/x86/include/asm/efi.h | 78
-rw-r--r-- arch/x86/include/asm/fixmap.h | 59
-rw-r--r-- arch/x86/include/asm/fpu-internal.h | 13
-rw-r--r-- arch/x86/include/asm/futex.h | 21
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 3
-rw-r--r-- arch/x86/include/asm/intel-mid.h | 48
-rw-r--r-- arch/x86/include/asm/iosf_mbi.h | 90
-rw-r--r-- arch/x86/include/asm/irq.h | 1
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 3
-rw-r--r-- arch/x86/include/asm/mce.h | 1
-rw-r--r-- arch/x86/include/asm/microcode.h | 15
-rw-r--r-- arch/x86/include/asm/microcode_amd.h | 7
-rw-r--r-- arch/x86/include/asm/mpspec.h | 1
-rw-r--r-- arch/x86/include/asm/mwait.h | 43
-rw-r--r-- arch/x86/include/asm/page.h | 1
-rw-r--r-- arch/x86/include/asm/page_32.h | 4
-rw-r--r-- arch/x86/include/asm/page_64_types.h | 15
-rw-r--r-- arch/x86/include/asm/page_types.h | 4
-rw-r--r-- arch/x86/include/asm/pci.h | 3
-rw-r--r-- arch/x86/include/asm/pgtable-2level.h | 100
-rw-r--r-- arch/x86/include/asm/pgtable_64_types.h | 2
-rw-r--r-- arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r-- arch/x86/include/asm/processor.h | 42
-rw-r--r-- arch/x86/include/asm/ptrace.h | 1
-rw-r--r-- arch/x86/include/asm/setup.h | 3
-rw-r--r-- arch/x86/include/asm/smp.h | 1
-rw-r--r-- arch/x86/include/asm/timer.h | 78
-rw-r--r-- arch/x86/include/asm/tsc.h | 3
-rw-r--r-- arch/x86/include/asm/uaccess.h | 124
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 4
-rw-r--r-- arch/x86/include/asm/vmx.h | 1
-rw-r--r-- arch/x86/include/asm/x86_init.h | 2
-rw-r--r-- arch/x86/include/asm/xen/page.h | 8
-rw-r--r-- arch/x86/include/asm/xsave.h | 14
-rw-r--r-- arch/x86/include/uapi/asm/bootparam.h | 2
-rw-r--r-- arch/x86/include/uapi/asm/hyperv.h | 13
-rw-r--r-- arch/x86/include/uapi/asm/msr-index.h | 2
-rw-r--r-- arch/x86/include/uapi/asm/stat.h | 42
-rw-r--r-- arch/x86/kernel/Makefile | 13
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 4
-rw-r--r-- arch/x86/kernel/acpi/cstate.c | 23
-rw-r--r-- arch/x86/kernel/apic/apic.c | 66
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 1
-rw-r--r-- arch/x86/kernel/apic/apic_noop.c | 1
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 20
-rw-r--r-- arch/x86/kernel/apic/ipi.c | 1
-rw-r--r-- arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r-- arch/x86/kernel/apic/x2apic_cluster.c | 1
-rw-r--r-- arch/x86/kernel/apic/x2apic_phys.c | 1
-rw-r--r-- arch/x86/kernel/check.c | 2
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 3
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 23
-rw-r--r-- arch/x86/kernel/cpu/centaur.c | 1
-rw-r--r-- arch/x86/kernel/cpu/common.c | 7
-rw-r--r-- arch/x86/kernel/cpu/cyrix.c | 1
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 29
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-apei.c | 14
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 12
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mcheck/p5.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mcheck/winchip.c | 1
-rw-r--r-- arch/x86/kernel/cpu/microcode/Makefile | 7
-rw-r--r-- arch/x86/kernel/cpu/microcode/amd.c (renamed from arch/x86/kernel/microcode_amd.c) | 15
-rw-r--r-- arch/x86/kernel/cpu/microcode/amd_early.c (renamed from arch/x86/kernel/microcode_amd_early.c) | 239
-rw-r--r-- arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c) | 0
-rw-r--r-- arch/x86/kernel/cpu/microcode/core_early.c (renamed from arch/x86/kernel/microcode_core_early.c) | 0
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c) | 2
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel_early.c (renamed from arch/x86/kernel/microcode_intel_early.c) | 10
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel_lib.c (renamed from arch/x86/kernel/microcode_intel_lib.c) | 0
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c | 16
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 53
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 679
-rw-r--r-- arch/x86/kernel/cpu/rdrand.c | 14
-rw-r--r-- arch/x86/kernel/cpu/transmeta.c | 1
-rw-r--r-- arch/x86/kernel/cpu/umc.c | 1
-rw-r--r-- arch/x86/kernel/crash.c | 1
-rw-r--r-- arch/x86/kernel/doublefault.c | 1
-rw-r--r-- arch/x86/kernel/e820.c | 2
-rw-r--r-- arch/x86/kernel/entry_32.S | 4
-rw-r--r-- arch/x86/kernel/entry_64.S | 2
-rw-r--r-- arch/x86/kernel/hw_breakpoint.c | 1
-rw-r--r-- arch/x86/kernel/iosf_mbi.c | 226
-rw-r--r-- arch/x86/kernel/irq.c | 89
-rw-r--r-- arch/x86/kernel/irqinit.c | 4
-rw-r--r-- arch/x86/kernel/kgdb.c | 1
-rw-r--r-- arch/x86/kernel/ksysfs.c | 340
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 1
-rw-r--r-- arch/x86/kernel/pci-nommu.c | 1
-rw-r--r-- arch/x86/kernel/process_32.c | 1
-rw-r--r-- arch/x86/kernel/setup.c | 54
-rw-r--r-- arch/x86/kernel/smpboot.c | 8
-rw-r--r-- arch/x86/kernel/traps.c | 22
-rw-r--r-- arch/x86/kernel/tsc.c | 328
-rw-r--r-- arch/x86/kernel/tsc_msr.c | 127
-rw-r--r-- arch/x86/kernel/tsc_sync.c | 1
-rw-r--r-- arch/x86/kernel/x86_init.c | 4
-rw-r--r-- arch/x86/kernel/xsave.c | 10
-rw-r--r-- arch/x86/kvm/Kconfig | 2
-rw-r--r-- arch/x86/kvm/i8254.c | 18
-rw-r--r-- arch/x86/kvm/lapic.c | 11
-rw-r--r-- arch/x86/kvm/mmu.c | 12
-rw-r--r-- arch/x86/kvm/paging_tmpl.h | 8
-rw-r--r-- arch/x86/kvm/svm.c | 15
-rw-r--r-- arch/x86/kvm/vmx.c | 323
-rw-r--r-- arch/x86/kvm/x86.c | 101
-rw-r--r-- arch/x86/kvm/x86.h | 2
-rw-r--r-- arch/x86/lib/copy_user_64.S | 12
-rw-r--r-- arch/x86/lib/delay.c | 1
-rw-r--r-- arch/x86/lib/x86-opcode-map.txt | 4
-rw-r--r-- arch/x86/mm/fault.c | 18
-rw-r--r-- arch/x86/mm/gup.c | 8
-rw-r--r-- arch/x86/mm/hugetlbpage.c | 9
-rw-r--r-- arch/x86/mm/init_32.c | 5
-rw-r--r-- arch/x86/mm/init_64.c | 2
-rw-r--r-- arch/x86/mm/kmmio.c | 1
-rw-r--r-- arch/x86/mm/memtest.c | 2
-rw-r--r-- arch/x86/mm/numa.c | 62
-rw-r--r-- arch/x86/mm/pageattr-test.c | 1
-rw-r--r-- arch/x86/mm/pageattr.c | 461
-rw-r--r-- arch/x86/mm/srat.c | 5
-rw-r--r-- arch/x86/net/bpf_jit_comp.c | 14
-rw-r--r-- arch/x86/pci/fixup.c | 1
-rw-r--r-- arch/x86/pci/intel_mid_pci.c | 6
-rw-r--r-- arch/x86/pci/xen.c | 2
-rw-r--r-- arch/x86/platform/efi/efi.c | 355
-rw-r--r-- arch/x86/platform/efi/efi_32.c | 12
-rw-r--r-- arch/x86/platform/efi/efi_64.c | 120
-rw-r--r-- arch/x86/platform/efi/efi_stub_64.S | 54
-rw-r--r-- arch/x86/platform/intel-mid/Makefile | 4
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_emc1403.c | 4
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 2
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_lis331.c | 4
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_max7315.c | 2
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c | 2
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c | 2
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_tca6416.c | 4
-rw-r--r-- arch/x86/platform/intel-mid/early_printk_intel_mid.c | 1
-rw-r--r-- arch/x86/platform/intel-mid/intel-mid.c | 64
-rw-r--r-- arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 19
-rw-r--r-- arch/x86/platform/intel-mid/mfld.c | 75
-rw-r--r-- arch/x86/platform/intel-mid/mrfl.c | 103
-rw-r--r-- arch/x86/platform/intel-mid/sfi.c | 46
-rw-r--r-- arch/x86/platform/iris/iris.c | 1
-rw-r--r-- arch/x86/platform/uv/tlb_uv.c | 66
-rw-r--r-- arch/x86/realmode/init.c | 26
-rw-r--r-- arch/x86/realmode/rm/reboot.S | 1
-rw-r--r-- arch/x86/realmode/rm/trampoline_32.S | 1
-rw-r--r-- arch/x86/realmode/rm/trampoline_64.S | 1
-rw-r--r-- arch/x86/syscalls/syscall_32.tbl | 2
-rw-r--r-- arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r-- arch/x86/tools/relocs.c | 20
-rw-r--r-- arch/x86/vdso/vclock_gettime.c | 8
-rw-r--r-- arch/x86/vdso/vdso.S | 1
-rw-r--r-- arch/x86/vdso/vdsox32.S | 1
-rw-r--r-- arch/x86/xen/Kconfig | 4
-rw-r--r-- arch/x86/xen/enlighten.c | 126
-rw-r--r-- arch/x86/xen/grant-table.c | 63
-rw-r--r-- arch/x86/xen/irq.c | 5
-rw-r--r-- arch/x86/xen/mmu.c | 166
-rw-r--r-- arch/x86/xen/p2m.c | 15
-rw-r--r-- arch/x86/xen/platform-pci-unplug.c | 79
-rw-r--r-- arch/x86/xen/setup.c | 40
-rw-r--r-- arch/x86/xen/smp.c | 49
-rw-r--r-- arch/x86/xen/time.c | 1
-rw-r--r-- arch/x86/xen/xen-head.S | 25
-rw-r--r-- arch/x86/xen/xen-ops.h | 1
191 files changed, 8526 insertions, 1356 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd60eca..3e97a3dd4129 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,6 +125,7 @@ config X86
125 select RTC_LIB 125 select RTC_LIB
126 select HAVE_DEBUG_STACKOVERFLOW 126 select HAVE_DEBUG_STACKOVERFLOW
127 select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 127 select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
128 select HAVE_CC_STACKPROTECTOR
128 129
129config INSTRUCTION_DECODER 130config INSTRUCTION_DECODER
130 def_bool y 131 def_bool y
@@ -278,13 +279,13 @@ config SMP
278 bool "Symmetric multi-processing support" 279 bool "Symmetric multi-processing support"
279 ---help--- 280 ---help---
280 This enables support for systems with more than one CPU. If you have 281 This enables support for systems with more than one CPU. If you have
281 a system with only one CPU, like most personal computers, say N. If 282 a system with only one CPU, say N. If you have a system with more
282 you have a system with more than one CPU, say Y. 283 than one CPU, say Y.
283 284
284 If you say N here, the kernel will run on single and multiprocessor 285 If you say N here, the kernel will run on uni- and multiprocessor
285 machines, but will use only one CPU of a multiprocessor machine. If 286 machines, but will use only one CPU of a multiprocessor machine. If
286 you say Y here, the kernel will run on many, but not all, 287 you say Y here, the kernel will run on many, but not all,
287 singleprocessor machines. On a singleprocessor machine, the kernel 288 uniprocessor machines. On a uniprocessor machine, the kernel
288 will run faster if you say N here. 289 will run faster if you say N here.
289 290
290 Note that if you say Y here and choose architecture "586" or 291 Note that if you say Y here and choose architecture "586" or
@@ -438,42 +439,26 @@ config X86_INTEL_CE
438 This option compiles in support for the CE4100 SOC for settop 439 This option compiles in support for the CE4100 SOC for settop
439 boxes and media devices. 440 boxes and media devices.
440 441
441config X86_WANT_INTEL_MID 442config X86_INTEL_MID
442 bool "Intel MID platform support" 443 bool "Intel MID platform support"
443 depends on X86_32 444 depends on X86_32
444 depends on X86_EXTENDED_PLATFORM 445 depends on X86_EXTENDED_PLATFORM
445 ---help---
446 Select to build a kernel capable of supporting Intel MID platform
447 systems which do not have the PCI legacy interfaces (Moorestown,
448 Medfield). If you are building for a PC class system say N here.
449
450if X86_WANT_INTEL_MID
451
452config X86_INTEL_MID
453 bool
454
455config X86_MDFLD
456 bool "Medfield MID platform"
457 depends on PCI 446 depends on PCI
458 depends on PCI_GOANY 447 depends on PCI_GOANY
459 depends on X86_IO_APIC 448 depends on X86_IO_APIC
460 select X86_INTEL_MID
461 select SFI 449 select SFI
450 select I2C
462 select DW_APB_TIMER 451 select DW_APB_TIMER
463 select APB_TIMER 452 select APB_TIMER
464 select I2C
465 select SPI
466 select INTEL_SCU_IPC 453 select INTEL_SCU_IPC
467 select X86_PLATFORM_DEVICES
468 select MFD_INTEL_MSIC 454 select MFD_INTEL_MSIC
469 ---help--- 455 ---help---
470 Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin 456 Select to build a kernel capable of supporting Intel MID (Mobile
471 Internet Device(MID) platform. 457 Internet Device) platform systems which do not have the PCI legacy
472 Unlike standard x86 PCs, Medfield does not have many legacy devices 458 interfaces. If you are building for a PC class system say N here.
473 nor standard legacy replacement devices/features. e.g. Medfield does
474 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
475 459
476endif 460 Intel MID platforms are based on an Intel processor and chipset which
461 consume less power than most of the x86 derivatives.
477 462
478config X86_INTEL_LPSS 463config X86_INTEL_LPSS
479 bool "Intel Low Power Subsystem Support" 464 bool "Intel Low Power Subsystem Support"
@@ -746,6 +731,7 @@ config APB_TIMER
746# The code disables itself when not needed. 731# The code disables itself when not needed.
747config DMI 732config DMI
748 default y 733 default y
734 select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
749 bool "Enable DMI scanning" if EXPERT 735 bool "Enable DMI scanning" if EXPERT
750 ---help--- 736 ---help---
751 Enabled scanning of DMI to identify machine quirks. Say Y 737 Enabled scanning of DMI to identify machine quirks. Say Y
@@ -953,7 +939,7 @@ config X86_ANCIENT_MCE
953 depends on X86_32 && X86_MCE 939 depends on X86_32 && X86_MCE
954 ---help--- 940 ---help---
955 Include support for machine check handling on old Pentium 5 or WinChip 941 Include support for machine check handling on old Pentium 5 or WinChip
956 systems. These typically need to be enabled explicitely on the command 942 systems. These typically need to be enabled explicitly on the command
957 line. 943 line.
958 944
959config X86_MCE_THRESHOLD 945config X86_MCE_THRESHOLD
@@ -1080,10 +1066,6 @@ config MICROCODE_OLD_INTERFACE
1080 def_bool y 1066 def_bool y
1081 depends on MICROCODE 1067 depends on MICROCODE
1082 1068
1083config MICROCODE_INTEL_LIB
1084 def_bool y
1085 depends on MICROCODE_INTEL
1086
1087config MICROCODE_INTEL_EARLY 1069config MICROCODE_INTEL_EARLY
1088 def_bool n 1070 def_bool n
1089 1071
@@ -1617,22 +1599,6 @@ config SECCOMP
1617 1599
1618 If unsure, say Y. Only embedded should say N here. 1600 If unsure, say Y. Only embedded should say N here.
1619 1601
1620config CC_STACKPROTECTOR
1621 bool "Enable -fstack-protector buffer overflow detection"
1622 ---help---
1623 This option turns on the -fstack-protector GCC feature. This
1624 feature puts, at the beginning of functions, a canary value on
1625 the stack just before the return address, and validates
1626 the value just before actually returning. Stack based buffer
1627 overflows (that need to overwrite this return address) now also
1628 overwrite the canary, which gets detected and the attack is then
1629 neutralized via a kernel panic.
1630
1631 This feature requires gcc version 4.2 or above, or a distribution
1632 gcc with the feature backported. Older versions are automatically
1633 detected and for those versions, this configuration option is
1634 ignored. (and a warning is printed during bootup)
1635
1636source kernel/Kconfig.hz 1602source kernel/Kconfig.hz
1637 1603
1638config KEXEC 1604config KEXEC
@@ -1728,16 +1694,67 @@ config RELOCATABLE
1728 1694
1729 Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address 1695 Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
1730 it has been loaded at and the compile time physical address 1696 it has been loaded at and the compile time physical address
1731 (CONFIG_PHYSICAL_START) is ignored. 1697 (CONFIG_PHYSICAL_START) is used as the minimum location.
1732 1698
1733# Relocation on x86-32 needs some additional build support 1699config RANDOMIZE_BASE
1700 bool "Randomize the address of the kernel image"
1701 depends on RELOCATABLE
1702 depends on !HIBERNATION
1703 default n
1704 ---help---
1705 Randomizes the physical and virtual address at which the
1706 kernel image is decompressed, as a security feature that
1707 deters exploit attempts relying on knowledge of the location
1708 of kernel internals.
1709
1710 Entropy is generated using the RDRAND instruction if it is
1711 supported. If RDTSC is supported, it is used as well. If
1712 neither RDRAND nor RDTSC are supported, then randomness is
1713 read from the i8254 timer.
1714
1715 The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
1716 and aligned according to PHYSICAL_ALIGN. Since the kernel is
1717 built using 2GiB addressing, and PHYSICAL_ALGIN must be at a
1718 minimum of 2MiB, only 10 bits of entropy is theoretically
1719 possible. At best, due to page table layouts, 64-bit can use
1720 9 bits of entropy and 32-bit uses 8 bits.
1721
1722 If unsure, say N.
1723
1724config RANDOMIZE_BASE_MAX_OFFSET
1725 hex "Maximum kASLR offset allowed" if EXPERT
1726 depends on RANDOMIZE_BASE
1727 range 0x0 0x20000000 if X86_32
1728 default "0x20000000" if X86_32
1729 range 0x0 0x40000000 if X86_64
1730 default "0x40000000" if X86_64
1731 ---help---
1732 The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical
1733 memory is used to determine the maximal offset in bytes that will
1734 be applied to the kernel when kernel Address Space Layout
1735 Randomization (kASLR) is active. This must be a multiple of
1736 PHYSICAL_ALIGN.
1737
1738 On 32-bit this is limited to 512MiB by page table layouts. The
1739 default is 512MiB.
1740
1741 On 64-bit this is limited by how the kernel fixmap page table is
1742 positioned, so this cannot be larger than 1GiB currently. Without
1743 RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel
1744 and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the
1745 modules area will shrink to compensate, up to the current maximum
1746 1GiB to 1GiB split. The default is 1GiB.
1747
1748 If unsure, leave at the default value.
1749
1750# Relocation on x86 needs some additional build support
1734config X86_NEED_RELOCS 1751config X86_NEED_RELOCS
1735 def_bool y 1752 def_bool y
1736 depends on X86_32 && RELOCATABLE 1753 depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
1737 1754
1738config PHYSICAL_ALIGN 1755config PHYSICAL_ALIGN
1739 hex "Alignment value to which kernel should be aligned" 1756 hex "Alignment value to which kernel should be aligned"
1740 default "0x1000000" 1757 default "0x200000"
1741 range 0x2000 0x1000000 if X86_32 1758 range 0x2000 0x1000000 if X86_32
1742 range 0x200000 0x1000000 if X86_64 1759 range 0x200000 0x1000000 if X86_64
1743 ---help--- 1760 ---help---
@@ -2393,6 +2410,14 @@ config X86_DMA_REMAP
2393 bool 2410 bool
2394 depends on STA2X11 2411 depends on STA2X11
2395 2412
2413config IOSF_MBI
2414 bool
2415 depends on PCI
2416 ---help---
2417 To be selected by modules requiring access to the Intel OnChip System
2418 Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms
2419 enumerable by PCI.
2420
2396source "net/Kconfig" 2421source "net/Kconfig"
2397 2422
2398source "drivers/Kconfig" 2423source "drivers/Kconfig"
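
The entropy figures quoted in the RANDOMIZE_BASE help text above follow directly from the offset window divided by the kernel alignment. A minimal stand-alone sketch of that arithmetic (illustrative only, not part of the patch; the constants mirror the x86_64 Kconfig defaults):

#include <stdio.h>

/* Mirrors the x86_64 defaults: 1 GiB maximum offset, 2 MiB alignment. */
#define RANDOMIZE_BASE_MAX_OFFSET	(1UL << 30)
#define PHYSICAL_ALIGN			(2UL << 20)

int main(void)
{
	unsigned long slots = RANDOMIZE_BASE_MAX_OFFSET / PHYSICAL_ALIGN;
	int bits = 0;

	while ((1UL << bits) < slots)
		bits++;

	/* 512 slots -> 9 bits of entropy, matching the help text. */
	printf("%lu slots, ~%d bits of entropy\n", slots, bits);
	return 0;
}

The 32-bit case works the same way: a 512 MiB window over 2 MiB slots gives 256 positions, i.e. the 8 bits mentioned in the help text.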
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 57d021507120..13b22e0f681d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -89,13 +89,11 @@ else
89 KBUILD_CFLAGS += -maccumulate-outgoing-args 89 KBUILD_CFLAGS += -maccumulate-outgoing-args
90endif 90endif
91 91
92# Make sure compiler does not have buggy stack-protector support.
92ifdef CONFIG_CC_STACKPROTECTOR 93ifdef CONFIG_CC_STACKPROTECTOR
93 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh 94 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
94 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) 95 ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
95 stackp-y := -fstack-protector 96 $(warning stack-protector enabled but compiler support broken)
96 KBUILD_CFLAGS += $(stackp-y)
97 else
98 $(warning stack protector enabled but no compiler support)
99 endif 97 endif
100endif 98endif
101 99
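
The CC_STACKPROTECTOR help text removed from the Kconfig (and the Makefile hunk above, which now only warns when compiler support is broken) describe the canary mechanism. A user-space demonstration of that effect, not part of the kernel build and deliberately containing an overflow; build with gcc -fstack-protector-all to see the check fire:

#include <string.h>

/*
 * The compiler places a canary before the return address; the overflow
 * below corrupts it, and the check at function exit aborts the program
 * ("stack smashing detected") instead of returning through a smashed
 * return address.
 */
static void smash(const char *src)
{
	char buf[8];

	strcpy(buf, src);	/* overflows buf for any src longer than 7 chars */
}

int main(void)
{
	smash("this string is much longer than eight bytes");
	return 0;
}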
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index d9c11956fce0..de7066918005 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -20,7 +20,7 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
20targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 20targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
21subdir- := compressed 21subdir- := compressed
22 22
23setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o 23setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
24setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o 24setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
25setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o 25setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
26setup-y += video-mode.o version.o 26setup-y += video-mode.o version.o
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 1dfbf64e52a2..d401b4a262b0 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -1,6 +1,6 @@
1/* ----------------------------------------------------------------------- 1/* -----------------------------------------------------------------------
2 * 2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin 3 * Copyright 2009-2014 Intel Corporation; author H. Peter Anvin
4 * 4 *
5 * This file is part of the Linux kernel, and is made available under 5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your 6 * the terms of the GNU General Public License version 2 or (at your
@@ -13,8 +13,8 @@
13 * touching registers they shouldn't be. 13 * touching registers they shouldn't be.
14 */ 14 */
15 15
16 .code16gcc 16 .code16
17 .text 17 .section ".inittext","ax"
18 .globl intcall 18 .globl intcall
19 .type intcall, @function 19 .type intcall, @function
20intcall: 20intcall:
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index ef72baeff484..50f8c5e0f37e 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -26,9 +26,8 @@
26#include <asm/boot.h> 26#include <asm/boot.h>
27#include <asm/setup.h> 27#include <asm/setup.h>
28#include "bitops.h" 28#include "bitops.h"
29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h>
31#include "ctype.h" 29#include "ctype.h"
30#include "cpuflags.h"
32 31
33/* Useful macros */ 32/* Useful macros */
34#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -307,14 +306,7 @@ static inline int cmdline_find_option_bool(const char *option)
307 return __cmdline_find_option_bool(cmd_line_ptr, option); 306 return __cmdline_find_option_bool(cmd_line_ptr, option);
308} 307}
309 308
310
311/* cpu.c, cpucheck.c */ 309/* cpu.c, cpucheck.c */
312struct cpu_features {
313 int level; /* Family, or 64 for x86-64 */
314 int model;
315 u32 flags[NCAPINTS];
316};
317extern struct cpu_features cpu;
318int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 310int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
319int validate_cpu(void); 311int validate_cpu(void);
320 312
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index c8a6792e7842..0fcd9133790c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,7 +28,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
28 28
29VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ 29VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
30 $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ 30 $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
31 $(obj)/piggy.o 31 $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o
32 32
33$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone 33$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
34 34
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
new file mode 100644
index 000000000000..90a21f430117
--- /dev/null
+++ b/arch/x86/boot/compressed/aslr.c
@@ -0,0 +1,316 @@
1#include "misc.h"
2
3#ifdef CONFIG_RANDOMIZE_BASE
4#include <asm/msr.h>
5#include <asm/archrandom.h>
6#include <asm/e820.h>
7
8#include <generated/compile.h>
9#include <linux/module.h>
10#include <linux/uts.h>
11#include <linux/utsname.h>
12#include <generated/utsrelease.h>
13
14/* Simplified build-specific string for starting entropy. */
15static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
16 LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
17
18#define I8254_PORT_CONTROL 0x43
19#define I8254_PORT_COUNTER0 0x40
20#define I8254_CMD_READBACK 0xC0
21#define I8254_SELECT_COUNTER0 0x02
22#define I8254_STATUS_NOTREADY 0x40
23static inline u16 i8254(void)
24{
25 u16 status, timer;
26
27 do {
28 outb(I8254_PORT_CONTROL,
29 I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
30 status = inb(I8254_PORT_COUNTER0);
31 timer = inb(I8254_PORT_COUNTER0);
32 timer |= inb(I8254_PORT_COUNTER0) << 8;
33 } while (status & I8254_STATUS_NOTREADY);
34
35 return timer;
36}
37
38static unsigned long rotate_xor(unsigned long hash, const void *area,
39 size_t size)
40{
41 size_t i;
42 unsigned long *ptr = (unsigned long *)area;
43
44 for (i = 0; i < size / sizeof(hash); i++) {
45 /* Rotate by odd number of bits and XOR. */
46 hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
47 hash ^= ptr[i];
48 }
49
50 return hash;
51}
52
53/* Attempt to create a simple but unpredictable starting entropy. */
54static unsigned long get_random_boot(void)
55{
56 unsigned long hash = 0;
57
58 hash = rotate_xor(hash, build_str, sizeof(build_str));
59 hash = rotate_xor(hash, real_mode, sizeof(*real_mode));
60
61 return hash;
62}
63
64static unsigned long get_random_long(void)
65{
66#ifdef CONFIG_X86_64
67 const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
68#else
69 const unsigned long mix_const = 0x3f39e593UL;
70#endif
71 unsigned long raw, random = get_random_boot();
72 bool use_i8254 = true;
73
74 debug_putstr("KASLR using");
75
76 if (has_cpuflag(X86_FEATURE_RDRAND)) {
77 debug_putstr(" RDRAND");
78 if (rdrand_long(&raw)) {
79 random ^= raw;
80 use_i8254 = false;
81 }
82 }
83
84 if (has_cpuflag(X86_FEATURE_TSC)) {
85 debug_putstr(" RDTSC");
86 rdtscll(raw);
87
88 random ^= raw;
89 use_i8254 = false;
90 }
91
92 if (use_i8254) {
93 debug_putstr(" i8254");
94 random ^= i8254();
95 }
96
97 /* Circular multiply for better bit diffusion */
98 asm("mul %3"
99 : "=a" (random), "=d" (raw)
100 : "a" (random), "rm" (mix_const));
101 random += raw;
102
103 debug_putstr("...\n");
104
105 return random;
106}
107
108struct mem_vector {
109 unsigned long start;
110 unsigned long size;
111};
112
113#define MEM_AVOID_MAX 5
114struct mem_vector mem_avoid[MEM_AVOID_MAX];
115
116static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
117{
118 /* Item at least partially before region. */
119 if (item->start < region->start)
120 return false;
121 /* Item at least partially after region. */
122 if (item->start + item->size > region->start + region->size)
123 return false;
124 return true;
125}
126
127static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
128{
129 /* Item one is entirely before item two. */
130 if (one->start + one->size <= two->start)
131 return false;
132 /* Item one is entirely after item two. */
133 if (one->start >= two->start + two->size)
134 return false;
135 return true;
136}
137
138static void mem_avoid_init(unsigned long input, unsigned long input_size,
139 unsigned long output, unsigned long output_size)
140{
141 u64 initrd_start, initrd_size;
142 u64 cmd_line, cmd_line_size;
143 unsigned long unsafe, unsafe_len;
144 char *ptr;
145
146 /*
147 * Avoid the region that is unsafe to overlap during
148 * decompression (see calculations at top of misc.c).
149 */
150 unsafe_len = (output_size >> 12) + 32768 + 18;
151 unsafe = (unsigned long)input + input_size - unsafe_len;
152 mem_avoid[0].start = unsafe;
153 mem_avoid[0].size = unsafe_len;
154
155 /* Avoid initrd. */
156 initrd_start = (u64)real_mode->ext_ramdisk_image << 32;
157 initrd_start |= real_mode->hdr.ramdisk_image;
158 initrd_size = (u64)real_mode->ext_ramdisk_size << 32;
159 initrd_size |= real_mode->hdr.ramdisk_size;
160 mem_avoid[1].start = initrd_start;
161 mem_avoid[1].size = initrd_size;
162
163 /* Avoid kernel command line. */
164 cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32;
165 cmd_line |= real_mode->hdr.cmd_line_ptr;
166 /* Calculate size of cmd_line. */
167 ptr = (char *)(unsigned long)cmd_line;
168 for (cmd_line_size = 0; ptr[cmd_line_size++]; )
169 ;
170 mem_avoid[2].start = cmd_line;
171 mem_avoid[2].size = cmd_line_size;
172
173 /* Avoid heap memory. */
174 mem_avoid[3].start = (unsigned long)free_mem_ptr;
175 mem_avoid[3].size = BOOT_HEAP_SIZE;
176
177 /* Avoid stack memory. */
178 mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
179 mem_avoid[4].size = BOOT_STACK_SIZE;
180}
181
182/* Does this memory vector overlap a known avoided area? */
183bool mem_avoid_overlap(struct mem_vector *img)
184{
185 int i;
186
187 for (i = 0; i < MEM_AVOID_MAX; i++) {
188 if (mem_overlaps(img, &mem_avoid[i]))
189 return true;
190 }
191
192 return false;
193}
194
195unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN];
196unsigned long slot_max = 0;
197
198static void slots_append(unsigned long addr)
199{
200 /* Overflowing the slots list should be impossible. */
201 if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
202 CONFIG_PHYSICAL_ALIGN)
203 return;
204
205 slots[slot_max++] = addr;
206}
207
208static unsigned long slots_fetch_random(void)
209{
210 /* Handle case of no slots stored. */
211 if (slot_max == 0)
212 return 0;
213
214 return slots[get_random_long() % slot_max];
215}
216
217static void process_e820_entry(struct e820entry *entry,
218 unsigned long minimum,
219 unsigned long image_size)
220{
221 struct mem_vector region, img;
222
223 /* Skip non-RAM entries. */
224 if (entry->type != E820_RAM)
225 return;
226
227 /* Ignore entries entirely above our maximum. */
228 if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
229 return;
230
231 /* Ignore entries entirely below our minimum. */
232 if (entry->addr + entry->size < minimum)
233 return;
234
235 region.start = entry->addr;
236 region.size = entry->size;
237
238 /* Potentially raise address to minimum location. */
239 if (region.start < minimum)
240 region.start = minimum;
241
242 /* Potentially raise address to meet alignment requirements. */
243 region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
244
245 /* Did we raise the address above the bounds of this e820 region? */
246 if (region.start > entry->addr + entry->size)
247 return;
248
249 /* Reduce size by any delta from the original address. */
250 region.size -= region.start - entry->addr;
251
252 /* Reduce maximum size to fit end of image within maximum limit. */
253 if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
254 region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
255
256 /* Walk each aligned slot and check for avoided areas. */
257 for (img.start = region.start, img.size = image_size ;
258 mem_contains(&region, &img) ;
259 img.start += CONFIG_PHYSICAL_ALIGN) {
260 if (mem_avoid_overlap(&img))
261 continue;
262 slots_append(img.start);
263 }
264}
265
266static unsigned long find_random_addr(unsigned long minimum,
267 unsigned long size)
268{
269 int i;
270 unsigned long addr;
271
272 /* Make sure minimum is aligned. */
273 minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
274
275 /* Verify potential e820 positions, appending to slots list. */
276 for (i = 0; i < real_mode->e820_entries; i++) {
277 process_e820_entry(&real_mode->e820_map[i], minimum, size);
278 }
279
280 return slots_fetch_random();
281}
282
283unsigned char *choose_kernel_location(unsigned char *input,
284 unsigned long input_size,
285 unsigned char *output,
286 unsigned long output_size)
287{
288 unsigned long choice = (unsigned long)output;
289 unsigned long random;
290
291 if (cmdline_find_option_bool("nokaslr")) {
292 debug_putstr("KASLR disabled...\n");
293 goto out;
294 }
295
296 /* Record the various known unsafe memory ranges. */
297 mem_avoid_init((unsigned long)input, input_size,
298 (unsigned long)output, output_size);
299
300 /* Walk e820 and find a random address. */
301 random = find_random_addr(choice, output_size);
302 if (!random) {
303 debug_putstr("KASLR could not find suitable E820 region...\n");
304 goto out;
305 }
306
307 /* Always enforce the minimum. */
308 if (random < choice)
309 goto out;
310
311 choice = random;
312out:
313 return (unsigned char *)choice;
314}
315
316#endif /* CONFIG_RANDOMIZE_BASE */
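
The heart of aslr.c is the slot walk in process_e820_entry(): clamp each E820 RAM region to the [minimum, RANDOMIZE_BASE_MAX_OFFSET) window, align its start, record every PHYSICAL_ALIGN-spaced position where the image fits without touching an avoided range, then pick one recorded slot at random. A user-space sketch of that walk with hypothetical region and avoid values (illustration only, not the boot code):

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define PHYS_ALIGN	0x200000UL	/* CONFIG_PHYSICAL_ALIGN, 2 MiB */
#define MAX_OFFSET	0x40000000UL	/* CONFIG_RANDOMIZE_BASE_MAX_OFFSET */

struct vec { unsigned long start, size; };

static int overlaps(const struct vec *a, const struct vec *b)
{
	return !(a->start + a->size <= b->start ||
		 a->start >= b->start + b->size);
}

int main(void)
{
	/* Hypothetical inputs: one RAM region, one range to avoid (say, an initrd). */
	struct vec region = { 0x01000000UL, 0x1f000000UL };
	struct vec avoid  = { 0x08000000UL, 0x00400000UL };
	unsigned long image_size = 0x01000000UL;	/* 16 MiB image */
	unsigned long addr, slots = 0;

	for (addr = ALIGN_UP(region.start, PHYS_ALIGN);
	     addr + image_size <= region.start + region.size &&
	     addr + image_size <= MAX_OFFSET;
	     addr += PHYS_ALIGN) {
		struct vec img = { addr, image_size };

		if (overlaps(&img, &avoid))
			continue;
		slots++;	/* the real code does slots_append(addr) here */
	}

	printf("%lu candidate slots\n", slots);
	return 0;
}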
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index bffd73b45b1f..b68e3033e6b9 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,6 +1,6 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK 3#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
4 4
5static unsigned long fs; 5static unsigned long fs;
6static inline void set_fs(unsigned long seg) 6static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 000000000000..aa313466118b
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,12 @@
1#ifdef CONFIG_RANDOMIZE_BASE
2
3#include "../cpuflags.c"
4
5bool has_cpuflag(int flag)
6{
7 get_cpuflags();
8
9 return test_bit(flag, cpu.flags);
10}
11
12#endif
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 5d6f6891b188..9116aac232c7 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -117,9 +117,11 @@ preferred_addr:
117 addl %eax, %ebx 117 addl %eax, %ebx
118 notl %eax 118 notl %eax
119 andl %eax, %ebx 119 andl %eax, %ebx
120#else 120 cmpl $LOAD_PHYSICAL_ADDR, %ebx
121 movl $LOAD_PHYSICAL_ADDR, %ebx 121 jge 1f
122#endif 122#endif
123 movl $LOAD_PHYSICAL_ADDR, %ebx
1241:
123 125
124 /* Target address to relocate to for decompression */ 126 /* Target address to relocate to for decompression */
125 addl $z_extract_offset, %ebx 127 addl $z_extract_offset, %ebx
@@ -191,14 +193,14 @@ relocated:
191 leal boot_heap(%ebx), %eax 193 leal boot_heap(%ebx), %eax
192 pushl %eax /* heap area */ 194 pushl %eax /* heap area */
193 pushl %esi /* real mode pointer */ 195 pushl %esi /* real mode pointer */
194 call decompress_kernel 196 call decompress_kernel /* returns kernel location in %eax */
195 addl $24, %esp 197 addl $24, %esp
196 198
197/* 199/*
198 * Jump to the decompressed kernel. 200 * Jump to the decompressed kernel.
199 */ 201 */
200 xorl %ebx, %ebx 202 xorl %ebx, %ebx
201 jmp *%ebp 203 jmp *%eax
202 204
203/* 205/*
204 * Stack and heap for uncompression 206 * Stack and heap for uncompression
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c337422b575d..c5c1ae0997e7 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -94,9 +94,11 @@ ENTRY(startup_32)
94 addl %eax, %ebx 94 addl %eax, %ebx
95 notl %eax 95 notl %eax
96 andl %eax, %ebx 96 andl %eax, %ebx
97#else 97 cmpl $LOAD_PHYSICAL_ADDR, %ebx
98 movl $LOAD_PHYSICAL_ADDR, %ebx 98 jge 1f
99#endif 99#endif
100 movl $LOAD_PHYSICAL_ADDR, %ebx
1011:
100 102
101 /* Target address to relocate to for decompression */ 103 /* Target address to relocate to for decompression */
102 addl $z_extract_offset, %ebx 104 addl $z_extract_offset, %ebx
@@ -269,9 +271,11 @@ preferred_addr:
269 addq %rax, %rbp 271 addq %rax, %rbp
270 notq %rax 272 notq %rax
271 andq %rax, %rbp 273 andq %rax, %rbp
272#else 274 cmpq $LOAD_PHYSICAL_ADDR, %rbp
273 movq $LOAD_PHYSICAL_ADDR, %rbp 275 jge 1f
274#endif 276#endif
277 movq $LOAD_PHYSICAL_ADDR, %rbp
2781:
275 279
276 /* Target address to relocate to for decompression */ 280 /* Target address to relocate to for decompression */
277 leaq z_extract_offset(%rbp), %rbx 281 leaq z_extract_offset(%rbp), %rbx
@@ -339,13 +343,13 @@ relocated:
339 movl $z_input_len, %ecx /* input_len */ 343 movl $z_input_len, %ecx /* input_len */
340 movq %rbp, %r8 /* output target address */ 344 movq %rbp, %r8 /* output target address */
341 movq $z_output_len, %r9 /* decompressed length */ 345 movq $z_output_len, %r9 /* decompressed length */
342 call decompress_kernel 346 call decompress_kernel /* returns kernel location in %rax */
343 popq %rsi 347 popq %rsi
344 348
345/* 349/*
346 * Jump to the decompressed kernel. 350 * Jump to the decompressed kernel.
347 */ 351 */
348 jmp *%rbp 352 jmp *%rax
349 353
350 .code32 354 .code32
351no_longmode: 355no_longmode:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 434f077d2c4d..196eaf373a06 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -112,14 +112,8 @@ struct boot_params *real_mode; /* Pointer to real-mode data */
112void *memset(void *s, int c, size_t n); 112void *memset(void *s, int c, size_t n);
113void *memcpy(void *dest, const void *src, size_t n); 113void *memcpy(void *dest, const void *src, size_t n);
114 114
115#ifdef CONFIG_X86_64 115memptr free_mem_ptr;
116#define memptr long 116memptr free_mem_end_ptr;
117#else
118#define memptr unsigned
119#endif
120
121static memptr free_mem_ptr;
122static memptr free_mem_end_ptr;
123 117
124static char *vidmem; 118static char *vidmem;
125static int vidport; 119static int vidport;
@@ -395,7 +389,7 @@ static void parse_elf(void *output)
395 free(phdrs); 389 free(phdrs);
396} 390}
397 391
398asmlinkage void decompress_kernel(void *rmode, memptr heap, 392asmlinkage void *decompress_kernel(void *rmode, memptr heap,
399 unsigned char *input_data, 393 unsigned char *input_data,
400 unsigned long input_len, 394 unsigned long input_len,
401 unsigned char *output, 395 unsigned char *output,
@@ -422,6 +416,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
422 free_mem_ptr = heap; /* Heap */ 416 free_mem_ptr = heap; /* Heap */
423 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 417 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
424 418
419 output = choose_kernel_location(input_data, input_len,
420 output, output_len);
421
422 /* Validate memory location choices. */
425 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) 423 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
426 error("Destination address inappropriately aligned"); 424 error("Destination address inappropriately aligned");
427#ifdef CONFIG_X86_64 425#ifdef CONFIG_X86_64
@@ -441,5 +439,5 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
441 parse_elf(output); 439 parse_elf(output);
442 handle_relocations(output, output_len); 440 handle_relocations(output, output_len);
443 debug_putstr("done.\nBooting the kernel.\n"); 441 debug_putstr("done.\nBooting the kernel.\n");
444 return; 442 return output;
445} 443}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 674019d8e235..24e3e569a13c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -23,7 +23,15 @@
23#define BOOT_BOOT_H 23#define BOOT_BOOT_H
24#include "../ctype.h" 24#include "../ctype.h"
25 25
26#ifdef CONFIG_X86_64
27#define memptr long
28#else
29#define memptr unsigned
30#endif
31
26/* misc.c */ 32/* misc.c */
33extern memptr free_mem_ptr;
34extern memptr free_mem_end_ptr;
27extern struct boot_params *real_mode; /* Pointer to real-mode data */ 35extern struct boot_params *real_mode; /* Pointer to real-mode data */
28void __putstr(const char *s); 36void __putstr(const char *s);
29#define error_putstr(__x) __putstr(__x) 37#define error_putstr(__x) __putstr(__x)
@@ -39,23 +47,40 @@ static inline void debug_putstr(const char *s)
39 47
40#endif 48#endif
41 49
42#ifdef CONFIG_EARLY_PRINTK 50#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
43
44/* cmdline.c */ 51/* cmdline.c */
45int cmdline_find_option(const char *option, char *buffer, int bufsize); 52int cmdline_find_option(const char *option, char *buffer, int bufsize);
46int cmdline_find_option_bool(const char *option); 53int cmdline_find_option_bool(const char *option);
54#endif
47 55
48/* early_serial_console.c */
49extern int early_serial_base;
50void console_init(void);
51 56
57#if CONFIG_RANDOMIZE_BASE
58/* aslr.c */
59unsigned char *choose_kernel_location(unsigned char *input,
60 unsigned long input_size,
61 unsigned char *output,
62 unsigned long output_size);
63/* cpuflags.c */
64bool has_cpuflag(int flag);
52#else 65#else
66static inline
67unsigned char *choose_kernel_location(unsigned char *input,
68 unsigned long input_size,
69 unsigned char *output,
70 unsigned long output_size)
71{
72 return output;
73}
74#endif
53 75
76#ifdef CONFIG_EARLY_PRINTK
54/* early_serial_console.c */ 77/* early_serial_console.c */
78extern int early_serial_base;
79void console_init(void);
80#else
55static const int early_serial_base; 81static const int early_serial_base;
56static inline void console_init(void) 82static inline void console_init(void)
57{ } 83{ }
58
59#endif 84#endif
60 85
61#endif 86#endif
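
The misc.h hunk uses a standard kernel idiom: misc.c always calls choose_kernel_location(), and the header provides either the real prototype or a static inline identity stub, so no #ifdef is needed at the call site. A stripped-down, runnable sketch of the pattern with generic names (not the actual header):

#include <stdio.h>

/* Build with -DCONFIG_FEATURE to mimic the option being enabled. */
#ifdef CONFIG_FEATURE
static unsigned long pick_location(unsigned long dflt)
{
	return dflt + 0x200000;		/* stand-in for the real chooser */
}
#else
static inline unsigned long pick_location(unsigned long dflt)
{
	return dflt;			/* option off: identity stub */
}
#endif

int main(void)
{
	/* The call site stays unconditional, exactly as in misc.c. */
	printf("location: %#lx\n", pick_location(0x1000000UL));
	return 0;
}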
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 11f272c6f5e9..1eb7d298b47d 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -14,7 +14,7 @@
14 * Memory copy routines 14 * Memory copy routines
15 */ 15 */
16 16
17 .code16gcc 17 .code16
18 .text 18 .text
19 19
20GLOBAL(memcpy) 20GLOBAL(memcpy)
@@ -30,7 +30,7 @@ GLOBAL(memcpy)
30 rep; movsb 30 rep; movsb
31 popw %di 31 popw %di
32 popw %si 32 popw %si
33 ret 33 retl
34ENDPROC(memcpy) 34ENDPROC(memcpy)
35 35
36GLOBAL(memset) 36GLOBAL(memset)
@@ -45,25 +45,25 @@ GLOBAL(memset)
45 andw $3, %cx 45 andw $3, %cx
46 rep; stosb 46 rep; stosb
47 popw %di 47 popw %di
48 ret 48 retl
49ENDPROC(memset) 49ENDPROC(memset)
50 50
51GLOBAL(copy_from_fs) 51GLOBAL(copy_from_fs)
52 pushw %ds 52 pushw %ds
53 pushw %fs 53 pushw %fs
54 popw %ds 54 popw %ds
55 call memcpy 55 calll memcpy
56 popw %ds 56 popw %ds
57 ret 57 retl
58ENDPROC(copy_from_fs) 58ENDPROC(copy_from_fs)
59 59
60GLOBAL(copy_to_fs) 60GLOBAL(copy_to_fs)
61 pushw %es 61 pushw %es
62 pushw %fs 62 pushw %fs
63 popw %es 63 popw %es
64 call memcpy 64 calll memcpy
65 popw %es 65 popw %es
66 ret 66 retl
67ENDPROC(copy_to_fs) 67ENDPROC(copy_to_fs)
68 68
69#if 0 /* Not currently used, but can be enabled as needed */ 69#if 0 /* Not currently used, but can be enabled as needed */
@@ -71,17 +71,17 @@ GLOBAL(copy_from_gs)
71 pushw %ds 71 pushw %ds
72 pushw %gs 72 pushw %gs
73 popw %ds 73 popw %ds
74 call memcpy 74 calll memcpy
75 popw %ds 75 popw %ds
76 ret 76 retl
77ENDPROC(copy_from_gs) 77ENDPROC(copy_from_gs)
78 78
79GLOBAL(copy_to_gs) 79GLOBAL(copy_to_gs)
80 pushw %es 80 pushw %es
81 pushw %gs 81 pushw %gs
82 popw %es 82 popw %es
83 call memcpy 83 calll memcpy
84 popw %es 84 popw %es
85 ret 85 retl
86ENDPROC(copy_to_gs) 86ENDPROC(copy_to_gs)
87#endif 87#endif
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4d3ff037201f..100a9a10076a 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -28,8 +28,6 @@
28#include <asm/required-features.h> 28#include <asm/required-features.h>
29#include <asm/msr-index.h> 29#include <asm/msr-index.h>
30 30
31struct cpu_features cpu;
32static u32 cpu_vendor[3];
33static u32 err_flags[NCAPINTS]; 31static u32 err_flags[NCAPINTS];
34 32
35static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY; 33static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
@@ -69,92 +67,8 @@ static int is_transmeta(void)
69 cpu_vendor[2] == A32('M', 'x', '8', '6'); 67 cpu_vendor[2] == A32('M', 'x', '8', '6');
70} 68}
71 69
72static int has_fpu(void)
73{
74 u16 fcw = -1, fsw = -1;
75 u32 cr0;
76
77 asm("movl %%cr0,%0" : "=r" (cr0));
78 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
79 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
80 asm volatile("movl %0,%%cr0" : : "r" (cr0));
81 }
82
83 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
84 : "+m" (fsw), "+m" (fcw));
85
86 return fsw == 0 && (fcw & 0x103f) == 0x003f;
87}
88
89static int has_eflag(u32 mask)
90{
91 u32 f0, f1;
92
93 asm("pushfl ; "
94 "pushfl ; "
95 "popl %0 ; "
96 "movl %0,%1 ; "
97 "xorl %2,%1 ; "
98 "pushl %1 ; "
99 "popfl ; "
100 "pushfl ; "
101 "popl %1 ; "
102 "popfl"
103 : "=&r" (f0), "=&r" (f1)
104 : "ri" (mask));
105
106 return !!((f0^f1) & mask);
107}
108
109static void get_flags(void)
110{
111 u32 max_intel_level, max_amd_level;
112 u32 tfms;
113
114 if (has_fpu())
115 set_bit(X86_FEATURE_FPU, cpu.flags);
116
117 if (has_eflag(X86_EFLAGS_ID)) {
118 asm("cpuid"
119 : "=a" (max_intel_level),
120 "=b" (cpu_vendor[0]),
121 "=d" (cpu_vendor[1]),
122 "=c" (cpu_vendor[2])
123 : "a" (0));
124
125 if (max_intel_level >= 0x00000001 &&
126 max_intel_level <= 0x0000ffff) {
127 asm("cpuid"
128 : "=a" (tfms),
129 "=c" (cpu.flags[4]),
130 "=d" (cpu.flags[0])
131 : "a" (0x00000001)
132 : "ebx");
133 cpu.level = (tfms >> 8) & 15;
134 cpu.model = (tfms >> 4) & 15;
135 if (cpu.level >= 6)
136 cpu.model += ((tfms >> 16) & 0xf) << 4;
137 }
138
139 asm("cpuid"
140 : "=a" (max_amd_level)
141 : "a" (0x80000000)
142 : "ebx", "ecx", "edx");
143
144 if (max_amd_level >= 0x80000001 &&
145 max_amd_level <= 0x8000ffff) {
146 u32 eax = 0x80000001;
147 asm("cpuid"
148 : "+a" (eax),
149 "=c" (cpu.flags[6]),
150 "=d" (cpu.flags[1])
151 : : "ebx");
152 }
153 }
154}
155
156/* Returns a bitmask of which words we have error bits in */ 70/* Returns a bitmask of which words we have error bits in */
157static int check_flags(void) 71static int check_cpuflags(void)
158{ 72{
159 u32 err; 73 u32 err;
160 int i; 74 int i;
@@ -187,8 +101,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
187 if (has_eflag(X86_EFLAGS_AC)) 101 if (has_eflag(X86_EFLAGS_AC))
188 cpu.level = 4; 102 cpu.level = 4;
189 103
190 get_flags(); 104 get_cpuflags();
191 err = check_flags(); 105 err = check_cpuflags();
192 106
193 if (test_bit(X86_FEATURE_LM, cpu.flags)) 107 if (test_bit(X86_FEATURE_LM, cpu.flags))
194 cpu.level = 64; 108 cpu.level = 64;
@@ -207,8 +121,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
207 eax &= ~(1 << 15); 121 eax &= ~(1 << 15);
208 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 122 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
209 123
210 get_flags(); /* Make sure it really did something */ 124 get_cpuflags(); /* Make sure it really did something */
211 err = check_flags(); 125 err = check_cpuflags();
212 } else if (err == 0x01 && 126 } else if (err == 0x01 &&
213 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) && 127 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
214 is_centaur() && cpu.model >= 6) { 128 is_centaur() && cpu.model >= 6) {
@@ -223,7 +137,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
223 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 137 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
224 138
225 set_bit(X86_FEATURE_CX8, cpu.flags); 139 set_bit(X86_FEATURE_CX8, cpu.flags);
226 err = check_flags(); 140 err = check_cpuflags();
227 } else if (err == 0x01 && is_transmeta()) { 141 } else if (err == 0x01 && is_transmeta()) {
228 /* Transmeta might have masked feature bits in word 0 */ 142 /* Transmeta might have masked feature bits in word 0 */
229 143
@@ -238,7 +152,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
238 : : "ecx", "ebx"); 152 : : "ecx", "ebx");
239 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 153 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
240 154
241 err = check_flags(); 155 err = check_cpuflags();
242 } 156 }
243 157
244 if (err_flags_ptr) 158 if (err_flags_ptr)
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644
index 000000000000..a9fcb7cfb241
--- /dev/null
+++ b/arch/x86/boot/cpuflags.c
@@ -0,0 +1,104 @@
1#include <linux/types.h>
2#include "bitops.h"
3
4#include <asm/processor-flags.h>
5#include <asm/required-features.h>
6#include <asm/msr-index.h>
7#include "cpuflags.h"
8
9struct cpu_features cpu;
10u32 cpu_vendor[3];
11
12static bool loaded_flags;
13
14static int has_fpu(void)
15{
16 u16 fcw = -1, fsw = -1;
17 unsigned long cr0;
18
19 asm volatile("mov %%cr0,%0" : "=r" (cr0));
20 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
21 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
22 asm volatile("mov %0,%%cr0" : : "r" (cr0));
23 }
24
25 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
26 : "+m" (fsw), "+m" (fcw));
27
28 return fsw == 0 && (fcw & 0x103f) == 0x003f;
29}
30
31int has_eflag(unsigned long mask)
32{
33 unsigned long f0, f1;
34
35 asm volatile("pushf \n\t"
36 "pushf \n\t"
37 "pop %0 \n\t"
38 "mov %0,%1 \n\t"
39 "xor %2,%1 \n\t"
40 "push %1 \n\t"
41 "popf \n\t"
42 "pushf \n\t"
43 "pop %1 \n\t"
44 "popf"
45 : "=&r" (f0), "=&r" (f1)
46 : "ri" (mask));
47
48 return !!((f0^f1) & mask);
49}
50
51/* Handle x86_32 PIC using ebx. */
52#if defined(__i386__) && defined(__PIC__)
53# define EBX_REG "=r"
54#else
55# define EBX_REG "=b"
56#endif
57
58static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
59{
60 asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
61 "cpuid \n\t"
62 ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
63 : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
64 : "a" (id)
65 );
66}
67
68void get_cpuflags(void)
69{
70 u32 max_intel_level, max_amd_level;
71 u32 tfms;
72 u32 ignored;
73
74 if (loaded_flags)
75 return;
76 loaded_flags = true;
77
78 if (has_fpu())
79 set_bit(X86_FEATURE_FPU, cpu.flags);
80
81 if (has_eflag(X86_EFLAGS_ID)) {
82 cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
83 &cpu_vendor[1]);
84
85 if (max_intel_level >= 0x00000001 &&
86 max_intel_level <= 0x0000ffff) {
87 cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
88 &cpu.flags[0]);
89 cpu.level = (tfms >> 8) & 15;
90 cpu.model = (tfms >> 4) & 15;
91 if (cpu.level >= 6)
92 cpu.model += ((tfms >> 16) & 0xf) << 4;
93 }
94
95 cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
96 &ignored);
97
98 if (max_amd_level >= 0x80000001 &&
99 max_amd_level <= 0x8000ffff) {
100 cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
101 &cpu.flags[1]);
102 }
103 }
104}
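
get_cpuflags() derives the family and model from the EAX value returned by CPUID leaf 1, using a simplified decode (no extended family). The same decode can be reproduced in user space with the compiler's <cpuid.h> helper rather than the boot code's own cpuid() wrapper; a GCC/clang-specific illustration:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int level, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;	/* CPUID leaf 1 not available */

	/* Same arithmetic as get_cpuflags(): family from bits 11:8, base
	 * model from bits 7:4, extended model folded in for family >= 6. */
	level = (eax >> 8) & 15;
	model = (eax >> 4) & 15;
	if (level >= 6)
		model += ((eax >> 16) & 0xf) << 4;

	printf("family %u, model 0x%x\n", level, model);
	return 0;
}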
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644
index 000000000000..ea97697e51e4
--- /dev/null
+++ b/arch/x86/boot/cpuflags.h
@@ -0,0 +1,19 @@
1#ifndef BOOT_CPUFLAGS_H
2#define BOOT_CPUFLAGS_H
3
4#include <asm/cpufeature.h>
5#include <asm/processor-flags.h>
6
7struct cpu_features {
8 int level; /* Family, or 64 for x86-64 */
9 int model;
10 u32 flags[NCAPINTS];
11};
12
13extern struct cpu_features cpu;
14extern u32 cpu_vendor[3];
15
16int has_eflag(unsigned long mask);
17void get_cpuflags(void);
18
19#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1f6d61..ec3b8ba68096 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -391,7 +391,14 @@ xloadflags:
391#else 391#else
392# define XLF23 0 392# define XLF23 0
393#endif 393#endif
394 .word XLF0 | XLF1 | XLF23 394
395#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
396# define XLF4 XLF_EFI_KEXEC
397#else
398# define XLF4 0
399#endif
400
401 .word XLF0 | XLF1 | XLF23 | XLF4
395 402
396cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 403cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
397 #added with boot protocol 404 #added with boot protocol
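
The header.S hunk advertises EFI kexec support through a new xloadflags bit (XLF_EFI_KEXEC, the bootparam.h change in this series' diffstat). Boot loaders and tools read that 16-bit field from the setup header; a host-side sketch that checks the bit in a bzImage, assuming the field offset 0x236 documented in Documentation/x86/boot.txt:

#include <stdio.h>
#include <stdint.h>

#define XLOADFLAGS_OFFSET	0x236		/* per Documentation/x86/boot.txt */
#define XLF_EFI_KEXEC		(1 << 4)

int main(int argc, char **argv)
{
	uint16_t xlf = 0;
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s bzImage\n", argv[0]);
		return 1;
	}

	f = fopen(argv[1], "rb");
	if (!f) {
		perror(argv[1]);
		return 1;
	}
	if (fseek(f, XLOADFLAGS_OFFSET, SEEK_SET) != 0 ||
	    fread(&xlf, sizeof(xlf), 1, f) != 1) {
		perror("read");
		fclose(f);
		return 1;
	}
	fclose(f);

	/* The field is little-endian in the image; fine on an x86 host. */
	printf("xloadflags = %#06x, EFI kexec %ssupported\n",
	       xlf, (xlf & XLF_EFI_KEXEC) ? "" : "not ");
	return 0;
}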
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0fc24db234a..6ba54d640383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -76,6 +76,7 @@ ifeq ($(avx2_supported),yes)
76endif 76endif
77 77
78aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 78aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
79aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
79ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 80ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
80sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o 81sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
81crc32c-intel-y := crc32c-intel_glue.o 82crc32c-intel-y := crc32c-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 000000000000..522ab68d1c88
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2811 @@
1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses. You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15# notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18# notice, this list of conditions and the following disclaimer in the
19# documentation and/or other materials provided with the
20# distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23# contributors may be used to endorse or promote products derived from
24# this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41## Erdinc Ozturk <erdinc.ozturk@intel.com>
42## Vinodh Gopal <vinodh.gopal@intel.com>
43## James Guilford <james.guilford@intel.com>
44## Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47## This code was derived and highly optimized from the code described in paper:
48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49## on Intel Architecture Processors. August, 2010
50## The details of the implementation is explained in:
51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52## on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59## 0 1 2 3
60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62## | Salt (From the SA) |
63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64## | Initialization Vector |
65## | (This is the sequence number from IPSec header) |
66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67## | 0x1 |
68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73## AAD padded to 128 bits with 0
74## for example, assume AAD is a u32 vector
75##
76## if AAD is 8 bytes:
77## AAD[3] = {A0, A1}#
78## padded AAD in xmm register = {A1 A0 0 0}
79##
80## 0 1 2 3
81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83## | SPI (A1) |
84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85## | 32-bit Sequence Number (A0) |
86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87## | 0x0 |
88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90## AAD Format with 32-bit Sequence Number
91##
92## if AAD is 12 bytes:
93## AAD[3] = {A0, A1, A2}#
94## padded AAD in xmm register = {A2 A1 A0 0}
95##
96## 0 1 2 3
97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99## | SPI (A2) |
100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101## | 64-bit Extended Sequence Number {A1,A0} |
102## | |
103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104## | 0x0 |
105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107## AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112## The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## throughout the code, one-tab and two-tab indentations are used. one tab is
119## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125.data
126.align 16
127
128POLY: .octa 0xC2000000000000000000000000000001
129POLY2: .octa 0xC20000000000000000000001C2000000
130TWOONE: .octa 0x00000001000000000000000000000001
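# (POLY holds the GHASH reduction polynomial, poly = x^128 + x^127 + x^126 +
# x^121 + 1 from the header above, in the bit-reflected form used throughout;
# POLY2 is the constant consumed by the pclmulqdq-based reduction in the AVX2
# path, and TWOONE is used below when computing HashKey<<1 mod poly.)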
131
132# order of these constants should not change.
133# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
134
135SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
136SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
137ALL_F: .octa 0xffffffffffffffffffffffffffffffff
138ZERO: .octa 0x00000000000000000000000000000000
139ONE: .octa 0x00000000000000000000000000000001
140ONEf: .octa 0x01000000000000000000000000000000
141
142.text
143
144
145## define the fields of the gcm aes context
146#{
147# u8 expanded_keys[16*11] store expanded keys
148# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
149# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
150# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
151# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
152# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
153# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
154# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
155# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
156# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
157# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
158# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
159# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
160# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
161# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
162# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
163# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
164#} gcm_ctx;
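# The byte offsets below follow the layout above: the 11 expanded round keys
# occupy the first 16*11 bytes of the context, so HashKey starts at offset
# 16*11, HashKey^2 at 16*12, and so on, with the Karatsuba _k values after them.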
165
166HashKey = 16*11 # store HashKey <<1 mod poly here
167HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
168HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
169HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
170HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
171HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
172HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
173HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
174HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
175HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
176HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
177HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
178HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
179HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
180HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
181HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
182
183#define arg1 %rdi
184#define arg2 %rsi
185#define arg3 %rdx
186#define arg4 %rcx
187#define arg5 %r8
188#define arg6 %r9
189#define arg7 STACK_OFFSET+8*1(%r14)
190#define arg8 STACK_OFFSET+8*2(%r14)
191#define arg9 STACK_OFFSET+8*3(%r14)
192
193i = 0
194j = 0
195
196out_order = 0
197in_order = 1
198DEC = 0
199ENC = 1
200
201.macro define_reg r n
202reg_\r = %xmm\n
203.endm
204
205.macro setreg
206.altmacro
207define_reg i %i
208define_reg j %j
209.noaltmacro
210.endm
211
212# need to push 4 callee-saved registers onto the stack before %r14 is set; the
#   stack arguments (arg7..arg9) are then found STACK_OFFSET bytes plus the
#   8-byte return address above %r14
213STACK_OFFSET = 8*4
214
215TMP1 = 16*0 # Temporary storage for AAD
216TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
217TMP3 = 16*2 # Temporary storage for AES State 3
218TMP4 = 16*3 # Temporary storage for AES State 4
219TMP5 = 16*4 # Temporary storage for AES State 5
220TMP6 = 16*5 # Temporary storage for AES State 6
221TMP7 = 16*6 # Temporary storage for AES State 7
222TMP8 = 16*7 # Temporary storage for AES State 8
223
224VARIABLE_OFFSET = 16*8
225
226################################
227# Utility Macros
228################################
229
230# Encryption of a single block
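# (a full AES-128 encryption: whitening XOR with round key 0, nine vaesenc
# rounds, and a final vaesenclast, using the expanded key schedule at arg1)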
231.macro ENCRYPT_SINGLE_BLOCK XMM0
232 vpxor (arg1), \XMM0, \XMM0
233 i = 1
234 setreg
235.rep 9
236 vaesenc 16*i(arg1), \XMM0, \XMM0
237 i = (i+1)
238 setreg
239.endr
240 vaesenclast 16*10(arg1), \XMM0, \XMM0
241.endm
242
243#ifdef CONFIG_AS_AVX
244###############################################################################
245# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
246# Input: A and B (128-bits each, bit-reflected)
247# Output: C = A*B*x mod poly, (i.e. >>1 )
248# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
249# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
250###############################################################################
251.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
252
253 vpshufd $0b01001110, \GH, \T2
254 vpshufd $0b01001110, \HK, \T3
255 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
256 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
257
258 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
259 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
260 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
261 vpxor \GH, \T2,\T2
262 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
263
264 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
265 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
266 vpxor \T3, \GH, \GH
267 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
268
269 #first phase of the reduction
270 vpslld $31, \GH, \T2 # packed right shifting << 31
271        vpslld $30, \GH, \T3              # packed right shifting << 30
272        vpslld $25, \GH, \T4              # packed right shifting << 25
273
274 vpxor \T3, \T2, \T2 # xor the shifted versions
275 vpxor \T4, \T2, \T2
276
277 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
278
279 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
280 vpxor \T2, \GH, \GH # first phase of the reduction complete
281
282 #second phase of the reduction
283
284 vpsrld $1,\GH, \T2 # packed left shifting >> 1
285 vpsrld $2,\GH, \T3 # packed left shifting >> 2
286 vpsrld $7,\GH, \T4 # packed left shifting >> 7
287 vpxor \T3, \T2, \T2 # xor the shifted versions
288 vpxor \T4, \T2, \T2
289
290 vpxor \T5, \T2, \T2
291 vpxor \T2, \GH, \GH
292 vpxor \T1, \GH, \GH # the result is in GH
293
294
295.endm
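# The multiply above uses one level of Karatsuba: with A = a1:a0 and B = b1:b0
# (64-bit halves), A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64
# + a0*b0, so three vpclmulqdq instructions replace the four of the schoolbook
# method; additions are XORs in GF(2).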
296
297.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
298
299        # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
300 vmovdqa \HK, \T5
301
302 vpshufd $0b01001110, \T5, \T1
303 vpxor \T5, \T1, \T1
304 vmovdqa \T1, HashKey_k(arg1)
305
306 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
307 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
308 vpshufd $0b01001110, \T5, \T1
309 vpxor \T5, \T1, \T1
310 vmovdqa \T1, HashKey_2_k(arg1)
311
312 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
313 vmovdqa \T5, HashKey_3(arg1)
314 vpshufd $0b01001110, \T5, \T1
315 vpxor \T5, \T1, \T1
316 vmovdqa \T1, HashKey_3_k(arg1)
317
318 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
319 vmovdqa \T5, HashKey_4(arg1)
320 vpshufd $0b01001110, \T5, \T1
321 vpxor \T5, \T1, \T1
322 vmovdqa \T1, HashKey_4_k(arg1)
323
324 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
325 vmovdqa \T5, HashKey_5(arg1)
326 vpshufd $0b01001110, \T5, \T1
327 vpxor \T5, \T1, \T1
328 vmovdqa \T1, HashKey_5_k(arg1)
329
330 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
331 vmovdqa \T5, HashKey_6(arg1)
332 vpshufd $0b01001110, \T5, \T1
333 vpxor \T5, \T1, \T1
334 vmovdqa \T1, HashKey_6_k(arg1)
335
336 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
337 vmovdqa \T5, HashKey_7(arg1)
338 vpshufd $0b01001110, \T5, \T1
339 vpxor \T5, \T1, \T1
340 vmovdqa \T1, HashKey_7_k(arg1)
341
342 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
343 vmovdqa \T5, HashKey_8(arg1)
344 vpshufd $0b01001110, \T5, \T1
345 vpxor \T5, \T1, \T1
346 vmovdqa \T1, HashKey_8_k(arg1)
347
348.endm
349
350## if a = number of total plaintext bytes
351## b = floor(a/16)
352## num_initial_blocks = b mod 8
353## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
354## r10, r11, r12, rax are clobbered
355## arg1, arg2, arg3, r14 are used as a pointer only, not modified
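## e.g. a = 100 bytes gives b = 6 and num_initial_blocks = 6: six full blocks
## are encrypted and GHASHed here, and the trailing 4 bytes are handled later
## by the partial-block path in GCM_ENC_DEC_AVX.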
356
357.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
358 i = (8-\num_initial_blocks)
359 setreg
360
361 mov arg6, %r10 # r10 = AAD
362 mov arg7, %r12 # r12 = aadLen
363
364
365 mov %r12, %r11
366
367 vpxor reg_i, reg_i, reg_i
368_get_AAD_loop\@:
369 vmovd (%r10), \T1
370 vpslldq $12, \T1, \T1
371 vpsrldq $4, reg_i, reg_i
372 vpxor \T1, reg_i, reg_i
373
374 add $4, %r10
375 sub $4, %r12
376 jg _get_AAD_loop\@
377
378
379 cmp $16, %r11
380 je _get_AAD_loop2_done\@
381 mov $16, %r12
382
383_get_AAD_loop2\@:
384 vpsrldq $4, reg_i, reg_i
385 sub $4, %r12
386 cmp %r11, %r12
387 jg _get_AAD_loop2\@
388
389_get_AAD_loop2_done\@:
390
391 #byte-reflect the AAD data
392 vpshufb SHUF_MASK(%rip), reg_i, reg_i
393
394 # initialize the data pointer offset as zero
395 xor %r11, %r11
396
397 # start AES for num_initial_blocks blocks
398 mov arg5, %rax # rax = *Y0
399 vmovdqu (%rax), \CTR # CTR = Y0
400 vpshufb SHUF_MASK(%rip), \CTR, \CTR
401
402
403 i = (9-\num_initial_blocks)
404 setreg
405.rep \num_initial_blocks
406 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
407 vmovdqa \CTR, reg_i
408 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
409 i = (i+1)
410 setreg
411.endr
412
413 vmovdqa (arg1), \T_key
414 i = (9-\num_initial_blocks)
415 setreg
416.rep \num_initial_blocks
417 vpxor \T_key, reg_i, reg_i
418 i = (i+1)
419 setreg
420.endr
421
422 j = 1
423 setreg
424.rep 9
425 vmovdqa 16*j(arg1), \T_key
426 i = (9-\num_initial_blocks)
427 setreg
428.rep \num_initial_blocks
429 vaesenc \T_key, reg_i, reg_i
430 i = (i+1)
431 setreg
432.endr
433
434 j = (j+1)
435 setreg
436.endr
437
438
439 vmovdqa 16*10(arg1), \T_key
440 i = (9-\num_initial_blocks)
441 setreg
442.rep \num_initial_blocks
443 vaesenclast \T_key, reg_i, reg_i
444 i = (i+1)
445 setreg
446.endr
447
448 i = (9-\num_initial_blocks)
449 setreg
450.rep \num_initial_blocks
451 vmovdqu (arg3, %r11), \T1
452 vpxor \T1, reg_i, reg_i
453 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
454 add $16, %r11
455.if \ENC_DEC == DEC
456 vmovdqa \T1, reg_i
457.endif
458 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
459 i = (i+1)
460 setreg
461.endr
462
463
464 i = (8-\num_initial_blocks)
465 j = (9-\num_initial_blocks)
466 setreg
467 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
468
469.rep \num_initial_blocks
470 vpxor reg_i, reg_j, reg_j
471 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
472 i = (i+1)
473 j = (j+1)
474 setreg
475.endr
476 # XMM8 has the combined result here
477
478 vmovdqa \XMM8, TMP1(%rsp)
479 vmovdqa \XMM8, \T3
480
481 cmp $128, %r13
482 jl _initial_blocks_done\@ # no need for precomputed constants
483
484###############################################################################
485# at least 128 bytes of data remain: prepare and encrypt the next 8 counter blocks
486 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
487 vmovdqa \CTR, \XMM1
488 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
489
490 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
491 vmovdqa \CTR, \XMM2
492 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
493
494 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
495 vmovdqa \CTR, \XMM3
496 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
497
498 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
499 vmovdqa \CTR, \XMM4
500 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
501
502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
503 vmovdqa \CTR, \XMM5
504 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
505
506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
507 vmovdqa \CTR, \XMM6
508 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
509
510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
511 vmovdqa \CTR, \XMM7
512 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
513
514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
515 vmovdqa \CTR, \XMM8
516 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
517
518 vmovdqa (arg1), \T_key
519 vpxor \T_key, \XMM1, \XMM1
520 vpxor \T_key, \XMM2, \XMM2
521 vpxor \T_key, \XMM3, \XMM3
522 vpxor \T_key, \XMM4, \XMM4
523 vpxor \T_key, \XMM5, \XMM5
524 vpxor \T_key, \XMM6, \XMM6
525 vpxor \T_key, \XMM7, \XMM7
526 vpxor \T_key, \XMM8, \XMM8
527
528 i = 1
529 setreg
530.rep 9 # do 9 rounds
531 vmovdqa 16*i(arg1), \T_key
532 vaesenc \T_key, \XMM1, \XMM1
533 vaesenc \T_key, \XMM2, \XMM2
534 vaesenc \T_key, \XMM3, \XMM3
535 vaesenc \T_key, \XMM4, \XMM4
536 vaesenc \T_key, \XMM5, \XMM5
537 vaesenc \T_key, \XMM6, \XMM6
538 vaesenc \T_key, \XMM7, \XMM7
539 vaesenc \T_key, \XMM8, \XMM8
540 i = (i+1)
541 setreg
542.endr
543
544
545 vmovdqa 16*i(arg1), \T_key
546 vaesenclast \T_key, \XMM1, \XMM1
547 vaesenclast \T_key, \XMM2, \XMM2
548 vaesenclast \T_key, \XMM3, \XMM3
549 vaesenclast \T_key, \XMM4, \XMM4
550 vaesenclast \T_key, \XMM5, \XMM5
551 vaesenclast \T_key, \XMM6, \XMM6
552 vaesenclast \T_key, \XMM7, \XMM7
553 vaesenclast \T_key, \XMM8, \XMM8
554
555 vmovdqu (arg3, %r11), \T1
556 vpxor \T1, \XMM1, \XMM1
557 vmovdqu \XMM1, (arg2 , %r11)
558 .if \ENC_DEC == DEC
559 vmovdqa \T1, \XMM1
560 .endif
561
562 vmovdqu 16*1(arg3, %r11), \T1
563 vpxor \T1, \XMM2, \XMM2
564 vmovdqu \XMM2, 16*1(arg2 , %r11)
565 .if \ENC_DEC == DEC
566 vmovdqa \T1, \XMM2
567 .endif
568
569 vmovdqu 16*2(arg3, %r11), \T1
570 vpxor \T1, \XMM3, \XMM3
571 vmovdqu \XMM3, 16*2(arg2 , %r11)
572 .if \ENC_DEC == DEC
573 vmovdqa \T1, \XMM3
574 .endif
575
576 vmovdqu 16*3(arg3, %r11), \T1
577 vpxor \T1, \XMM4, \XMM4
578 vmovdqu \XMM4, 16*3(arg2 , %r11)
579 .if \ENC_DEC == DEC
580 vmovdqa \T1, \XMM4
581 .endif
582
583 vmovdqu 16*4(arg3, %r11), \T1
584 vpxor \T1, \XMM5, \XMM5
585 vmovdqu \XMM5, 16*4(arg2 , %r11)
586 .if \ENC_DEC == DEC
587 vmovdqa \T1, \XMM5
588 .endif
589
590 vmovdqu 16*5(arg3, %r11), \T1
591 vpxor \T1, \XMM6, \XMM6
592 vmovdqu \XMM6, 16*5(arg2 , %r11)
593 .if \ENC_DEC == DEC
594 vmovdqa \T1, \XMM6
595 .endif
596
597 vmovdqu 16*6(arg3, %r11), \T1
598 vpxor \T1, \XMM7, \XMM7
599 vmovdqu \XMM7, 16*6(arg2 , %r11)
600 .if \ENC_DEC == DEC
601 vmovdqa \T1, \XMM7
602 .endif
603
604 vmovdqu 16*7(arg3, %r11), \T1
605 vpxor \T1, \XMM8, \XMM8
606 vmovdqu \XMM8, 16*7(arg2 , %r11)
607 .if \ENC_DEC == DEC
608 vmovdqa \T1, \XMM8
609 .endif
610
611 add $128, %r11
612
613 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
614 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
615 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
616 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
617 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
618 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
619 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
620 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
621 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
622
623###############################################################################
624
625_initial_blocks_done\@:
626
627.endm
628
629# encrypt 8 blocks at a time
630# ghash the 8 previously encrypted ciphertext blocks
631# arg1, arg2, arg3 are used as pointers only, not modified
632# r11 is the data offset value
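# The AES rounds for the current 8 counter blocks are interleaved with the
# GHASH (carry-less multiply and accumulate) of the 8 ciphertext blocks saved
# from the previous iteration, so the two computations overlap.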
633.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
634
635 vmovdqa \XMM1, \T2
636 vmovdqa \XMM2, TMP2(%rsp)
637 vmovdqa \XMM3, TMP3(%rsp)
638 vmovdqa \XMM4, TMP4(%rsp)
639 vmovdqa \XMM5, TMP5(%rsp)
640 vmovdqa \XMM6, TMP6(%rsp)
641 vmovdqa \XMM7, TMP7(%rsp)
642 vmovdqa \XMM8, TMP8(%rsp)
643
644.if \loop_idx == in_order
645 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
646 vpaddd ONE(%rip), \XMM1, \XMM2
647 vpaddd ONE(%rip), \XMM2, \XMM3
648 vpaddd ONE(%rip), \XMM3, \XMM4
649 vpaddd ONE(%rip), \XMM4, \XMM5
650 vpaddd ONE(%rip), \XMM5, \XMM6
651 vpaddd ONE(%rip), \XMM6, \XMM7
652 vpaddd ONE(%rip), \XMM7, \XMM8
653 vmovdqa \XMM8, \CTR
654
655 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
656 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
657 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
658 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
659 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
660 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
661 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
662 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
663.else
664 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
665 vpaddd ONEf(%rip), \XMM1, \XMM2
666 vpaddd ONEf(%rip), \XMM2, \XMM3
667 vpaddd ONEf(%rip), \XMM3, \XMM4
668 vpaddd ONEf(%rip), \XMM4, \XMM5
669 vpaddd ONEf(%rip), \XMM5, \XMM6
670 vpaddd ONEf(%rip), \XMM6, \XMM7
671 vpaddd ONEf(%rip), \XMM7, \XMM8
672 vmovdqa \XMM8, \CTR
673.endif
674
675
676 #######################################################################
677
678 vmovdqu (arg1), \T1
679 vpxor \T1, \XMM1, \XMM1
680 vpxor \T1, \XMM2, \XMM2
681 vpxor \T1, \XMM3, \XMM3
682 vpxor \T1, \XMM4, \XMM4
683 vpxor \T1, \XMM5, \XMM5
684 vpxor \T1, \XMM6, \XMM6
685 vpxor \T1, \XMM7, \XMM7
686 vpxor \T1, \XMM8, \XMM8
687
688 #######################################################################
689
690
691
692
693
694 vmovdqu 16*1(arg1), \T1
695 vaesenc \T1, \XMM1, \XMM1
696 vaesenc \T1, \XMM2, \XMM2
697 vaesenc \T1, \XMM3, \XMM3
698 vaesenc \T1, \XMM4, \XMM4
699 vaesenc \T1, \XMM5, \XMM5
700 vaesenc \T1, \XMM6, \XMM6
701 vaesenc \T1, \XMM7, \XMM7
702 vaesenc \T1, \XMM8, \XMM8
703
704 vmovdqu 16*2(arg1), \T1
705 vaesenc \T1, \XMM1, \XMM1
706 vaesenc \T1, \XMM2, \XMM2
707 vaesenc \T1, \XMM3, \XMM3
708 vaesenc \T1, \XMM4, \XMM4
709 vaesenc \T1, \XMM5, \XMM5
710 vaesenc \T1, \XMM6, \XMM6
711 vaesenc \T1, \XMM7, \XMM7
712 vaesenc \T1, \XMM8, \XMM8
713
714
715 #######################################################################
716
717 vmovdqa HashKey_8(arg1), \T5
718 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
719 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
720
721 vpshufd $0b01001110, \T2, \T6
722 vpxor \T2, \T6, \T6
723
724 vmovdqa HashKey_8_k(arg1), \T5
725 vpclmulqdq $0x00, \T5, \T6, \T6
726
727 vmovdqu 16*3(arg1), \T1
728 vaesenc \T1, \XMM1, \XMM1
729 vaesenc \T1, \XMM2, \XMM2
730 vaesenc \T1, \XMM3, \XMM3
731 vaesenc \T1, \XMM4, \XMM4
732 vaesenc \T1, \XMM5, \XMM5
733 vaesenc \T1, \XMM6, \XMM6
734 vaesenc \T1, \XMM7, \XMM7
735 vaesenc \T1, \XMM8, \XMM8
736
737 vmovdqa TMP2(%rsp), \T1
738 vmovdqa HashKey_7(arg1), \T5
739 vpclmulqdq $0x11, \T5, \T1, \T3
740 vpxor \T3, \T4, \T4
741 vpclmulqdq $0x00, \T5, \T1, \T3
742 vpxor \T3, \T7, \T7
743
744 vpshufd $0b01001110, \T1, \T3
745 vpxor \T1, \T3, \T3
746 vmovdqa HashKey_7_k(arg1), \T5
747 vpclmulqdq $0x10, \T5, \T3, \T3
748 vpxor \T3, \T6, \T6
749
750 vmovdqu 16*4(arg1), \T1
751 vaesenc \T1, \XMM1, \XMM1
752 vaesenc \T1, \XMM2, \XMM2
753 vaesenc \T1, \XMM3, \XMM3
754 vaesenc \T1, \XMM4, \XMM4
755 vaesenc \T1, \XMM5, \XMM5
756 vaesenc \T1, \XMM6, \XMM6
757 vaesenc \T1, \XMM7, \XMM7
758 vaesenc \T1, \XMM8, \XMM8
759
760 #######################################################################
761
762 vmovdqa TMP3(%rsp), \T1
763 vmovdqa HashKey_6(arg1), \T5
764 vpclmulqdq $0x11, \T5, \T1, \T3
765 vpxor \T3, \T4, \T4
766 vpclmulqdq $0x00, \T5, \T1, \T3
767 vpxor \T3, \T7, \T7
768
769 vpshufd $0b01001110, \T1, \T3
770 vpxor \T1, \T3, \T3
771 vmovdqa HashKey_6_k(arg1), \T5
772 vpclmulqdq $0x10, \T5, \T3, \T3
773 vpxor \T3, \T6, \T6
774
775 vmovdqu 16*5(arg1), \T1
776 vaesenc \T1, \XMM1, \XMM1
777 vaesenc \T1, \XMM2, \XMM2
778 vaesenc \T1, \XMM3, \XMM3
779 vaesenc \T1, \XMM4, \XMM4
780 vaesenc \T1, \XMM5, \XMM5
781 vaesenc \T1, \XMM6, \XMM6
782 vaesenc \T1, \XMM7, \XMM7
783 vaesenc \T1, \XMM8, \XMM8
784
785 vmovdqa TMP4(%rsp), \T1
786 vmovdqa HashKey_5(arg1), \T5
787 vpclmulqdq $0x11, \T5, \T1, \T3
788 vpxor \T3, \T4, \T4
789 vpclmulqdq $0x00, \T5, \T1, \T3
790 vpxor \T3, \T7, \T7
791
792 vpshufd $0b01001110, \T1, \T3
793 vpxor \T1, \T3, \T3
794 vmovdqa HashKey_5_k(arg1), \T5
795 vpclmulqdq $0x10, \T5, \T3, \T3
796 vpxor \T3, \T6, \T6
797
798 vmovdqu 16*6(arg1), \T1
799 vaesenc \T1, \XMM1, \XMM1
800 vaesenc \T1, \XMM2, \XMM2
801 vaesenc \T1, \XMM3, \XMM3
802 vaesenc \T1, \XMM4, \XMM4
803 vaesenc \T1, \XMM5, \XMM5
804 vaesenc \T1, \XMM6, \XMM6
805 vaesenc \T1, \XMM7, \XMM7
806 vaesenc \T1, \XMM8, \XMM8
807
808
809 vmovdqa TMP5(%rsp), \T1
810 vmovdqa HashKey_4(arg1), \T5
811 vpclmulqdq $0x11, \T5, \T1, \T3
812 vpxor \T3, \T4, \T4
813 vpclmulqdq $0x00, \T5, \T1, \T3
814 vpxor \T3, \T7, \T7
815
816 vpshufd $0b01001110, \T1, \T3
817 vpxor \T1, \T3, \T3
818 vmovdqa HashKey_4_k(arg1), \T5
819 vpclmulqdq $0x10, \T5, \T3, \T3
820 vpxor \T3, \T6, \T6
821
822 vmovdqu 16*7(arg1), \T1
823 vaesenc \T1, \XMM1, \XMM1
824 vaesenc \T1, \XMM2, \XMM2
825 vaesenc \T1, \XMM3, \XMM3
826 vaesenc \T1, \XMM4, \XMM4
827 vaesenc \T1, \XMM5, \XMM5
828 vaesenc \T1, \XMM6, \XMM6
829 vaesenc \T1, \XMM7, \XMM7
830 vaesenc \T1, \XMM8, \XMM8
831
832 vmovdqa TMP6(%rsp), \T1
833 vmovdqa HashKey_3(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
835 vpxor \T3, \T4, \T4
836 vpclmulqdq $0x00, \T5, \T1, \T3
837 vpxor \T3, \T7, \T7
838
839 vpshufd $0b01001110, \T1, \T3
840 vpxor \T1, \T3, \T3
841 vmovdqa HashKey_3_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
843 vpxor \T3, \T6, \T6
844
845
846 vmovdqu 16*8(arg1), \T1
847 vaesenc \T1, \XMM1, \XMM1
848 vaesenc \T1, \XMM2, \XMM2
849 vaesenc \T1, \XMM3, \XMM3
850 vaesenc \T1, \XMM4, \XMM4
851 vaesenc \T1, \XMM5, \XMM5
852 vaesenc \T1, \XMM6, \XMM6
853 vaesenc \T1, \XMM7, \XMM7
854 vaesenc \T1, \XMM8, \XMM8
855
856 vmovdqa TMP7(%rsp), \T1
857 vmovdqa HashKey_2(arg1), \T5
858 vpclmulqdq $0x11, \T5, \T1, \T3
859 vpxor \T3, \T4, \T4
860 vpclmulqdq $0x00, \T5, \T1, \T3
861 vpxor \T3, \T7, \T7
862
863 vpshufd $0b01001110, \T1, \T3
864 vpxor \T1, \T3, \T3
865 vmovdqa HashKey_2_k(arg1), \T5
866 vpclmulqdq $0x10, \T5, \T3, \T3
867 vpxor \T3, \T6, \T6
868
869 #######################################################################
870
871 vmovdqu 16*9(arg1), \T5
872 vaesenc \T5, \XMM1, \XMM1
873 vaesenc \T5, \XMM2, \XMM2
874 vaesenc \T5, \XMM3, \XMM3
875 vaesenc \T5, \XMM4, \XMM4
876 vaesenc \T5, \XMM5, \XMM5
877 vaesenc \T5, \XMM6, \XMM6
878 vaesenc \T5, \XMM7, \XMM7
879 vaesenc \T5, \XMM8, \XMM8
880
881 vmovdqa TMP8(%rsp), \T1
882 vmovdqa HashKey(arg1), \T5
883 vpclmulqdq $0x11, \T5, \T1, \T3
884 vpxor \T3, \T4, \T4
885 vpclmulqdq $0x00, \T5, \T1, \T3
886 vpxor \T3, \T7, \T7
887
888 vpshufd $0b01001110, \T1, \T3
889 vpxor \T1, \T3, \T3
890 vmovdqa HashKey_k(arg1), \T5
891 vpclmulqdq $0x10, \T5, \T3, \T3
892 vpxor \T3, \T6, \T6
893
894 vpxor \T4, \T6, \T6
895 vpxor \T7, \T6, \T6
896
897 vmovdqu 16*10(arg1), \T5
898
899 i = 0
900 j = 1
901 setreg
902.rep 8
903 vpxor 16*i(arg3, %r11), \T5, \T2
904 .if \ENC_DEC == ENC
905 vaesenclast \T2, reg_j, reg_j
906 .else
907 vaesenclast \T2, reg_j, \T3
908 vmovdqu 16*i(arg3, %r11), reg_j
909 vmovdqu \T3, 16*i(arg2, %r11)
910 .endif
911 i = (i+1)
912 j = (j+1)
913 setreg
914.endr
915 #######################################################################
916
917
918 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
919        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
920 vpxor \T3, \T7, \T7
921 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
922
923
924
925 #######################################################################
926 #first phase of the reduction
927 #######################################################################
928 vpslld $31, \T7, \T2 # packed right shifting << 31
929        vpslld $30, \T7, \T3                    # packed right shifting << 30
930        vpslld $25, \T7, \T4                    # packed right shifting << 25
931
932 vpxor \T3, \T2, \T2 # xor the shifted versions
933 vpxor \T4, \T2, \T2
934
935 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
936
937 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
938 vpxor \T2, \T7, \T7 # first phase of the reduction complete
939 #######################################################################
940 .if \ENC_DEC == ENC
941 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
942 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
943 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
944 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
945 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
946 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
947 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
948 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
949 .endif
950
951 #######################################################################
952 #second phase of the reduction
953 vpsrld $1, \T7, \T2 # packed left shifting >> 1
954 vpsrld $2, \T7, \T3 # packed left shifting >> 2
955 vpsrld $7, \T7, \T4 # packed left shifting >> 7
956 vpxor \T3, \T2, \T2 # xor the shifted versions
957 vpxor \T4, \T2, \T2
958
959 vpxor \T1, \T2, \T2
960 vpxor \T2, \T7, \T7
961 vpxor \T7, \T6, \T6 # the result is in T6
962 #######################################################################
963
964 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
965 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
966 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
967 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
968 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
969 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
970 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
971 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
972
973
974 vpxor \T6, \XMM1, \XMM1
975
976
977
978.endm
979
980
981# GHASH the last 8 ciphertext blocks.
982.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
983
984 ## Karatsuba Method
985
986
987 vpshufd $0b01001110, \XMM1, \T2
988 vpxor \XMM1, \T2, \T2
989 vmovdqa HashKey_8(arg1), \T5
990 vpclmulqdq $0x11, \T5, \XMM1, \T6
991 vpclmulqdq $0x00, \T5, \XMM1, \T7
992
993 vmovdqa HashKey_8_k(arg1), \T3
994 vpclmulqdq $0x00, \T3, \T2, \XMM1
995
996 ######################
997
998 vpshufd $0b01001110, \XMM2, \T2
999 vpxor \XMM2, \T2, \T2
1000 vmovdqa HashKey_7(arg1), \T5
1001 vpclmulqdq $0x11, \T5, \XMM2, \T4
1002 vpxor \T4, \T6, \T6
1003
1004 vpclmulqdq $0x00, \T5, \XMM2, \T4
1005 vpxor \T4, \T7, \T7
1006
1007 vmovdqa HashKey_7_k(arg1), \T3
1008 vpclmulqdq $0x00, \T3, \T2, \T2
1009 vpxor \T2, \XMM1, \XMM1
1010
1011 ######################
1012
1013 vpshufd $0b01001110, \XMM3, \T2
1014 vpxor \XMM3, \T2, \T2
1015 vmovdqa HashKey_6(arg1), \T5
1016 vpclmulqdq $0x11, \T5, \XMM3, \T4
1017 vpxor \T4, \T6, \T6
1018
1019 vpclmulqdq $0x00, \T5, \XMM3, \T4
1020 vpxor \T4, \T7, \T7
1021
1022 vmovdqa HashKey_6_k(arg1), \T3
1023 vpclmulqdq $0x00, \T3, \T2, \T2
1024 vpxor \T2, \XMM1, \XMM1
1025
1026 ######################
1027
1028 vpshufd $0b01001110, \XMM4, \T2
1029 vpxor \XMM4, \T2, \T2
1030 vmovdqa HashKey_5(arg1), \T5
1031 vpclmulqdq $0x11, \T5, \XMM4, \T4
1032 vpxor \T4, \T6, \T6
1033
1034 vpclmulqdq $0x00, \T5, \XMM4, \T4
1035 vpxor \T4, \T7, \T7
1036
1037 vmovdqa HashKey_5_k(arg1), \T3
1038 vpclmulqdq $0x00, \T3, \T2, \T2
1039 vpxor \T2, \XMM1, \XMM1
1040
1041 ######################
1042
1043 vpshufd $0b01001110, \XMM5, \T2
1044 vpxor \XMM5, \T2, \T2
1045 vmovdqa HashKey_4(arg1), \T5
1046 vpclmulqdq $0x11, \T5, \XMM5, \T4
1047 vpxor \T4, \T6, \T6
1048
1049 vpclmulqdq $0x00, \T5, \XMM5, \T4
1050 vpxor \T4, \T7, \T7
1051
1052 vmovdqa HashKey_4_k(arg1), \T3
1053 vpclmulqdq $0x00, \T3, \T2, \T2
1054 vpxor \T2, \XMM1, \XMM1
1055
1056 ######################
1057
1058 vpshufd $0b01001110, \XMM6, \T2
1059 vpxor \XMM6, \T2, \T2
1060 vmovdqa HashKey_3(arg1), \T5
1061 vpclmulqdq $0x11, \T5, \XMM6, \T4
1062 vpxor \T4, \T6, \T6
1063
1064 vpclmulqdq $0x00, \T5, \XMM6, \T4
1065 vpxor \T4, \T7, \T7
1066
1067 vmovdqa HashKey_3_k(arg1), \T3
1068 vpclmulqdq $0x00, \T3, \T2, \T2
1069 vpxor \T2, \XMM1, \XMM1
1070
1071 ######################
1072
1073 vpshufd $0b01001110, \XMM7, \T2
1074 vpxor \XMM7, \T2, \T2
1075 vmovdqa HashKey_2(arg1), \T5
1076 vpclmulqdq $0x11, \T5, \XMM7, \T4
1077 vpxor \T4, \T6, \T6
1078
1079 vpclmulqdq $0x00, \T5, \XMM7, \T4
1080 vpxor \T4, \T7, \T7
1081
1082 vmovdqa HashKey_2_k(arg1), \T3
1083 vpclmulqdq $0x00, \T3, \T2, \T2
1084 vpxor \T2, \XMM1, \XMM1
1085
1086 ######################
1087
1088 vpshufd $0b01001110, \XMM8, \T2
1089 vpxor \XMM8, \T2, \T2
1090 vmovdqa HashKey(arg1), \T5
1091 vpclmulqdq $0x11, \T5, \XMM8, \T4
1092 vpxor \T4, \T6, \T6
1093
1094 vpclmulqdq $0x00, \T5, \XMM8, \T4
1095 vpxor \T4, \T7, \T7
1096
1097 vmovdqa HashKey_k(arg1), \T3
1098 vpclmulqdq $0x00, \T3, \T2, \T2
1099
1100 vpxor \T2, \XMM1, \XMM1
1101 vpxor \T6, \XMM1, \XMM1
1102 vpxor \T7, \XMM1, \T2
1103
1104
1105
1106
1107 vpslldq $8, \T2, \T4
1108 vpsrldq $8, \T2, \T2
1109
1110 vpxor \T4, \T7, \T7
1111 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1112 # the accumulated carry-less multiplications
1113
1114 #######################################################################
1115 #first phase of the reduction
1116 vpslld $31, \T7, \T2 # packed right shifting << 31
1117        vpslld $30, \T7, \T3                   # packed right shifting << 30
1118        vpslld $25, \T7, \T4                   # packed right shifting << 25
1119
1120 vpxor \T3, \T2, \T2 # xor the shifted versions
1121 vpxor \T4, \T2, \T2
1122
1123 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1124
1125 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1126 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1127 #######################################################################
1128
1129
1130 #second phase of the reduction
1131 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1132 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1133 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1134 vpxor \T3, \T2, \T2 # xor the shifted versions
1135 vpxor \T4, \T2, \T2
1136
1137 vpxor \T1, \T2, \T2
1138 vpxor \T2, \T7, \T7
1139 vpxor \T7, \T6, \T6 # the result is in T6
1140
1141.endm
1142
1143
1144# combined macro for the GCM encrypt and decrypt functions
1145# clobbers all xmm registers
1146# clobbers r10, r11, r12, r13, r14, r15
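# Flow: GHASH the AAD and encrypt up to 7 initial blocks (INITIAL_BLOCKS_AVX),
# process the bulk of the data 8 blocks at a time
# (GHASH_8_ENCRYPT_8_PARALLEL_AVX), fold the final 8 GHASH values
# (GHASH_LAST_8_AVX), handle any remaining partial block, and form the
# authentication tag from the final GHASH value XORed with E(K, Y0).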
1147.macro GCM_ENC_DEC_AVX ENC_DEC
1148
1149        # the number of bytes pushed (8 per register) must equal STACK_OFFSET
1150 push %r12
1151 push %r13
1152 push %r14
1153 push %r15
1154
1155 mov %rsp, %r14
1156
1157
1158
1159
1160 sub $VARIABLE_OFFSET, %rsp
1161 and $~63, %rsp # align rsp to 64 bytes
1162
1163
1164 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1165
1166 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1167 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1168
1169 mov %r13, %r12
1170 shr $4, %r12
1171 and $7, %r12
1172 jz _initial_num_blocks_is_0\@
1173
1174 cmp $7, %r12
1175 je _initial_num_blocks_is_7\@
1176 cmp $6, %r12
1177 je _initial_num_blocks_is_6\@
1178 cmp $5, %r12
1179 je _initial_num_blocks_is_5\@
1180 cmp $4, %r12
1181 je _initial_num_blocks_is_4\@
1182 cmp $3, %r12
1183 je _initial_num_blocks_is_3\@
1184 cmp $2, %r12
1185 je _initial_num_blocks_is_2\@
1186
1187 jmp _initial_num_blocks_is_1\@
1188
1189_initial_num_blocks_is_7\@:
1190 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1191 sub $16*7, %r13
1192 jmp _initial_blocks_encrypted\@
1193
1194_initial_num_blocks_is_6\@:
1195 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1196 sub $16*6, %r13
1197 jmp _initial_blocks_encrypted\@
1198
1199_initial_num_blocks_is_5\@:
1200 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1201 sub $16*5, %r13
1202 jmp _initial_blocks_encrypted\@
1203
1204_initial_num_blocks_is_4\@:
1205 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1206 sub $16*4, %r13
1207 jmp _initial_blocks_encrypted\@
1208
1209_initial_num_blocks_is_3\@:
1210 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1211 sub $16*3, %r13
1212 jmp _initial_blocks_encrypted\@
1213
1214_initial_num_blocks_is_2\@:
1215 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1216 sub $16*2, %r13
1217 jmp _initial_blocks_encrypted\@
1218
1219_initial_num_blocks_is_1\@:
1220 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1221 sub $16*1, %r13
1222 jmp _initial_blocks_encrypted\@
1223
1224_initial_num_blocks_is_0\@:
1225 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1226
1227
1228_initial_blocks_encrypted\@:
1229 cmp $0, %r13
1230 je _zero_cipher_left\@
1231
1232 sub $128, %r13
1233 je _eight_cipher_left\@
1234
1235
1236
1237
1238 vmovd %xmm9, %r15d
1239 and $255, %r15d
1240 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1241
1242
1243_encrypt_by_8_new\@:
1244 cmp $(255-8), %r15d
1245 jg _encrypt_by_8\@
1246
1247
1248
1249 add $8, %r15b
1250 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1251 add $128, %r11
1252 sub $128, %r13
1253 jne _encrypt_by_8_new\@
1254
1255 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1256 jmp _eight_cipher_left\@
1257
1258_encrypt_by_8\@:
1259 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1260 add $8, %r15b
1261 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1262 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1263 add $128, %r11
1264 sub $128, %r13
1265 jne _encrypt_by_8_new\@
1266
1267 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1268
1269
1270
1271
1272_eight_cipher_left\@:
1273 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1274
1275
1276_zero_cipher_left\@:
1277 cmp $16, arg4
1278 jl _only_less_than_16\@
1279
1280 mov arg4, %r13
1281 and $15, %r13 # r13 = (arg4 mod 16)
1282
1283 je _multiple_of_16_bytes\@
1284
1285        # handle the last <16 Byte block separately
1286
1287
1288 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1289 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1290 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1291
1292 sub $16, %r11
1293 add %r13, %r11
1294 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1295
1296 lea SHIFT_MASK+16(%rip), %r12
1297 sub %r13, %r12 # adjust the shuffle mask pointer to be
1298 # able to shift 16-r13 bytes (r13 is the
1299 # number of bytes in plaintext mod 16)
1300 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1301 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1302 jmp _final_ghash_mul\@
1303
1304_only_less_than_16\@:
1305 # check for 0 length
1306 mov arg4, %r13
1307 and $15, %r13 # r13 = (arg4 mod 16)
1308
1309 je _multiple_of_16_bytes\@
1310
1311        # handle the last <16 Byte block separately
1312
1313
1314 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1315 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1316 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1317
1318
1319 lea SHIFT_MASK+16(%rip), %r12
1320 sub %r13, %r12 # adjust the shuffle mask pointer to be
1321 # able to shift 16-r13 bytes (r13 is the
1322 # number of bytes in plaintext mod 16)
1323
1324_get_last_16_byte_loop\@:
1325 movb (arg3, %r11), %al
1326 movb %al, TMP1 (%rsp , %r11)
1327 add $1, %r11
1328 cmp %r13, %r11
1329 jne _get_last_16_byte_loop\@
1330
1331 vmovdqu TMP1(%rsp), %xmm1
1332
1333 sub $16, %r11
1334
1335_final_ghash_mul\@:
1336 .if \ENC_DEC == DEC
1337 vmovdqa %xmm1, %xmm2
1338 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1339 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1340 # mask out top 16-r13 bytes of xmm9
1341 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1342 vpand %xmm1, %xmm2, %xmm2
1343 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1344 vpxor %xmm2, %xmm14, %xmm14
1345 #GHASH computation for the last <16 Byte block
1346 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1347 sub %r13, %r11
1348 add $16, %r11
1349 .else
1350 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1351 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1352 # mask out top 16-r13 bytes of xmm9
1353 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1354 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1355 vpxor %xmm9, %xmm14, %xmm14
1356 #GHASH computation for the last <16 Byte block
1357 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1358 sub %r13, %r11
1359 add $16, %r11
1360 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1361 .endif
1362
1363
1364 #############################
1365 # output r13 Bytes
1366 vmovq %xmm9, %rax
1367 cmp $8, %r13
1368 jle _less_than_8_bytes_left\@
1369
1370 mov %rax, (arg2 , %r11)
1371 add $8, %r11
1372 vpsrldq $8, %xmm9, %xmm9
1373 vmovq %xmm9, %rax
1374 sub $8, %r13
1375
1376_less_than_8_bytes_left\@:
1377 movb %al, (arg2 , %r11)
1378 add $1, %r11
1379 shr $8, %rax
1380 sub $1, %r13
1381 jne _less_than_8_bytes_left\@
1382 #############################
1383
1384_multiple_of_16_bytes\@:
1385 mov arg7, %r12 # r12 = aadLen (number of bytes)
1386 shl $3, %r12 # convert into number of bits
1387 vmovd %r12d, %xmm15 # len(A) in xmm15
1388
1389        shl $3, arg4                         # len(C) in bits (*8)
1390 vmovq arg4, %xmm1
1391 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1392 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1393
1394 vpxor %xmm15, %xmm14, %xmm14
1395 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1396 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1397
1398 mov arg5, %rax # rax = *Y0
1399 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1400
1401 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1402
1403 vpxor %xmm14, %xmm9, %xmm9
1404
1405
1406
1407_return_T\@:
1408 mov arg8, %r10 # r10 = authTag
1409 mov arg9, %r11 # r11 = auth_tag_len
1410
1411 cmp $16, %r11
1412 je _T_16\@
1413
1414 cmp $12, %r11
1415 je _T_12\@
1416
1417_T_8\@:
1418 vmovq %xmm9, %rax
1419 mov %rax, (%r10)
1420 jmp _return_T_done\@
1421_T_12\@:
1422 vmovq %xmm9, %rax
1423 mov %rax, (%r10)
1424 vpsrldq $8, %xmm9, %xmm9
1425 vmovd %xmm9, %eax
1426 mov %eax, 8(%r10)
1427 jmp _return_T_done\@
1428
1429_T_16\@:
1430 vmovdqu %xmm9, (%r10)
1431
1432_return_T_done\@:
1433 mov %r14, %rsp
1434
1435 pop %r15
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439.endm
1440
1441
1442#############################################################
1443#void aesni_gcm_precomp_avx_gen2
1444# (gcm_data *my_ctx_data,
1445#        (gcm_data *my_ctx_data,
1446#        u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1446#############################################################
1447ENTRY(aesni_gcm_precomp_avx_gen2)
1448        # the number of bytes pushed (8 per register) must equal STACK_OFFSET
1449 push %r12
1450 push %r13
1451 push %r14
1452 push %r15
1453
1454 mov %rsp, %r14
1455
1456
1457
1458 sub $VARIABLE_OFFSET, %rsp
1459 and $~63, %rsp # align rsp to 64 bytes
1460
1461 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1462
1463 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1464 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1465 vmovdqa %xmm6, %xmm2
1466 vpsllq $1, %xmm6, %xmm6
1467 vpsrlq $63, %xmm2, %xmm2
1468 vmovdqa %xmm2, %xmm1
1469 vpslldq $8, %xmm2, %xmm2
1470 vpsrldq $8, %xmm1, %xmm1
1471 vpor %xmm2, %xmm6, %xmm6
1472 #reduction
1473 vpshufd $0b00100100, %xmm1, %xmm2
1474 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1475 vpand POLY(%rip), %xmm2, %xmm2
1476 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1477 #######################################################################
1478 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1479
1480
1481 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1482
1483 mov %r14, %rsp
1484
1485 pop %r15
1486 pop %r14
1487 pop %r13
1488 pop %r12
1489 ret
1490ENDPROC(aesni_gcm_precomp_avx_gen2)
1491
1492###############################################################################
1493#void aesni_gcm_enc_avx_gen2(
1494# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1495# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1496# const u8 *in, /* Plaintext input */
1497# u64 plaintext_len, /* Length of data in Bytes for encryption. */
1498# u8 *iv, /* Pre-counter block j0: 4 byte salt
1499# (from Security Association) concatenated with 8 byte
1500# Initialisation Vector (from IPSec ESP Payload)
1501# concatenated with 0x00000001. 16-byte aligned pointer. */
1502# const u8 *aad, /* Additional Authentication Data (AAD)*/
1503# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1504# u8 *auth_tag, /* Authenticated Tag output. */
1505#        u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1506# Valid values are 16 (most likely), 12 or 8. */
1507###############################################################################
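# For illustration only (using the parameter names documented above, not the
# declarations from the C glue code), a call producing a 16-byte tag would be:
#   aesni_gcm_enc_avx_gen2(my_ctx_data, out, in, plaintext_len, iv,
#                          aad, aad_len, auth_tag, 16);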
1508ENTRY(aesni_gcm_enc_avx_gen2)
1509 GCM_ENC_DEC_AVX ENC
1510 ret
1511ENDPROC(aesni_gcm_enc_avx_gen2)
1512
1513###############################################################################
1514#void aesni_gcm_dec_avx_gen2(
1515# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1516# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1517# const u8 *in, /* Ciphertext input */
1518#        u64 plaintext_len, /* Length of data in Bytes for decryption. */
1519# u8 *iv, /* Pre-counter block j0: 4 byte salt
1520# (from Security Association) concatenated with 8 byte
1521# Initialisation Vector (from IPSec ESP Payload)
1522# concatenated with 0x00000001. 16-byte aligned pointer. */
1523# const u8 *aad, /* Additional Authentication Data (AAD)*/
1524# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1525# u8 *auth_tag, /* Authenticated Tag output. */
1526#        u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1527# Valid values are 16 (most likely), 12 or 8. */
1528###############################################################################
1529ENTRY(aesni_gcm_dec_avx_gen2)
1530 GCM_ENC_DEC_AVX DEC
1531 ret
1532ENDPROC(aesni_gcm_dec_avx_gen2)
1533#endif /* CONFIG_AS_AVX */
1534
1535#ifdef CONFIG_AS_AVX2
1536###############################################################################
1537# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1538# Input: A and B (128-bits each, bit-reflected)
1539# Output: C = A*B*x mod poly, (i.e. >>1 )
1540# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1541# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1542###############################################################################
1543.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1544
1545 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1546 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1547 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1548 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1549 vpxor \T3, \GH, \GH
1550
1551
1552 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1553 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1554
1555 vpxor \T3, \T1, \T1
1556 vpxor \T2, \GH, \GH
1557
1558 #######################################################################
1559 #first phase of the reduction
1560 vmovdqa POLY2(%rip), \T3
1561
1562 vpclmulqdq $0x01, \GH, \T3, \T2
1563 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1564
1565 vpxor \T2, \GH, \GH # first phase of the reduction complete
1566 #######################################################################
1567 #second phase of the reduction
1568 vpclmulqdq $0x00, \GH, \T3, \T2
1569 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1570
1571 vpclmulqdq $0x10, \GH, \T3, \GH
1572 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1573
1574 vpxor \T2, \GH, \GH # second phase of the reduction complete
1575 #######################################################################
1576 vpxor \T1, \GH, \GH # the result is in GH
1577
1578
1579.endm
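# Unlike GHASH_MUL_AVX, this variant computes all four 64x64 carry-less
# products directly (no Karatsuba) and performs both reduction phases as
# vpclmulqdq multiplications by POLY2 instead of the shift/XOR sequences.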
1580
1581.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1582
1583        # precompute HashKey^2..HashKey^8, each <<1 mod poly (the Karatsuba _k values are not needed here)
1584 vmovdqa \HK, \T5
1585 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1586 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1587
1588 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1589 vmovdqa \T5, HashKey_3(arg1)
1590
1591 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1592 vmovdqa \T5, HashKey_4(arg1)
1593
1594 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1595 vmovdqa \T5, HashKey_5(arg1)
1596
1597 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1598 vmovdqa \T5, HashKey_6(arg1)
1599
1600 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1601 vmovdqa \T5, HashKey_7(arg1)
1602
1603 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1604 vmovdqa \T5, HashKey_8(arg1)
1605
1606.endm
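# Only HashKey^2..HashKey^8 (each <<1 mod poly) are stored; the HashKey_i_k
# Karatsuba values written by PRECOMPUTE_AVX are not needed by GHASH_MUL_AVX2.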
1607
1608
1609## if a = number of total plaintext bytes
1610## b = floor(a/16)
1611## num_initial_blocks = b mod 8
1612## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1613## r10, r11, r12, rax are clobbered
1614## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1615
1616.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1617 i = (8-\num_initial_blocks)
1618 setreg
1619
1620 mov arg6, %r10 # r10 = AAD
1621 mov arg7, %r12 # r12 = aadLen
1622
1623
1624 mov %r12, %r11
1625
1626 vpxor reg_i, reg_i, reg_i
1627_get_AAD_loop\@:
1628 vmovd (%r10), \T1
1629 vpslldq $12, \T1, \T1
1630 vpsrldq $4, reg_i, reg_i
1631 vpxor \T1, reg_i, reg_i
1632
1633 add $4, %r10
1634 sub $4, %r12
1635 jg _get_AAD_loop\@
1636
1637
1638 cmp $16, %r11
1639 je _get_AAD_loop2_done\@
1640 mov $16, %r12
1641
1642_get_AAD_loop2\@:
1643 vpsrldq $4, reg_i, reg_i
1644 sub $4, %r12
1645 cmp %r11, %r12
1646 jg _get_AAD_loop2\@
1647
1648_get_AAD_loop2_done\@:
1649
1650 #byte-reflect the AAD data
1651 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1652
1653 # initialize the data pointer offset as zero
1654 xor %r11, %r11
1655
1656 # start AES for num_initial_blocks blocks
1657 mov arg5, %rax # rax = *Y0
1658 vmovdqu (%rax), \CTR # CTR = Y0
1659 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1660
1661
1662 i = (9-\num_initial_blocks)
1663 setreg
1664.rep \num_initial_blocks
1665 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1666 vmovdqa \CTR, reg_i
1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1668 i = (i+1)
1669 setreg
1670.endr
1671
1672 vmovdqa (arg1), \T_key
1673 i = (9-\num_initial_blocks)
1674 setreg
1675.rep \num_initial_blocks
1676 vpxor \T_key, reg_i, reg_i
1677 i = (i+1)
1678 setreg
1679.endr
1680
1681 j = 1
1682 setreg
1683.rep 9
1684 vmovdqa 16*j(arg1), \T_key
1685 i = (9-\num_initial_blocks)
1686 setreg
1687.rep \num_initial_blocks
1688 vaesenc \T_key, reg_i, reg_i
1689 i = (i+1)
1690 setreg
1691.endr
1692
1693 j = (j+1)
1694 setreg
1695.endr
1696
1697
1698 vmovdqa 16*10(arg1), \T_key
1699 i = (9-\num_initial_blocks)
1700 setreg
1701.rep \num_initial_blocks
1702 vaesenclast \T_key, reg_i, reg_i
1703 i = (i+1)
1704 setreg
1705.endr
1706
1707 i = (9-\num_initial_blocks)
1708 setreg
1709.rep \num_initial_blocks
1710 vmovdqu (arg3, %r11), \T1
1711 vpxor \T1, reg_i, reg_i
1712 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1713 # num_initial_blocks blocks
1714 add $16, %r11
1715.if \ENC_DEC == DEC
1716 vmovdqa \T1, reg_i
1717.endif
1718 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1719 i = (i+1)
1720 setreg
1721.endr
1722
1723
1724 i = (8-\num_initial_blocks)
1725 j = (9-\num_initial_blocks)
1726 setreg
1727 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1728
1729.rep \num_initial_blocks
1730 vpxor reg_i, reg_j, reg_j
1731 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1732 i = (i+1)
1733 j = (j+1)
1734 setreg
1735.endr
1736 # XMM8 has the combined result here
1737
1738 vmovdqa \XMM8, TMP1(%rsp)
1739 vmovdqa \XMM8, \T3
1740
1741 cmp $128, %r13
1742 jl _initial_blocks_done\@ # no need for precomputed constants
1743
1744###############################################################################
1745# at least 128 bytes of data remain: prepare and encrypt the next 8 counter blocks
1746 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1747 vmovdqa \CTR, \XMM1
1748 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1749
1750 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1751 vmovdqa \CTR, \XMM2
1752 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1753
1754 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1755 vmovdqa \CTR, \XMM3
1756 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1757
1758 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1759 vmovdqa \CTR, \XMM4
1760 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1761
1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1763 vmovdqa \CTR, \XMM5
1764 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1765
1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1767 vmovdqa \CTR, \XMM6
1768 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1769
1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1771 vmovdqa \CTR, \XMM7
1772 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1773
1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1775 vmovdqa \CTR, \XMM8
1776 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1777
1778 vmovdqa (arg1), \T_key
1779 vpxor \T_key, \XMM1, \XMM1
1780 vpxor \T_key, \XMM2, \XMM2
1781 vpxor \T_key, \XMM3, \XMM3
1782 vpxor \T_key, \XMM4, \XMM4
1783 vpxor \T_key, \XMM5, \XMM5
1784 vpxor \T_key, \XMM6, \XMM6
1785 vpxor \T_key, \XMM7, \XMM7
1786 vpxor \T_key, \XMM8, \XMM8
1787
1788 i = 1
1789 setreg
1790.rep 9 # do 9 rounds
1791 vmovdqa 16*i(arg1), \T_key
1792 vaesenc \T_key, \XMM1, \XMM1
1793 vaesenc \T_key, \XMM2, \XMM2
1794 vaesenc \T_key, \XMM3, \XMM3
1795 vaesenc \T_key, \XMM4, \XMM4
1796 vaesenc \T_key, \XMM5, \XMM5
1797 vaesenc \T_key, \XMM6, \XMM6
1798 vaesenc \T_key, \XMM7, \XMM7
1799 vaesenc \T_key, \XMM8, \XMM8
1800 i = (i+1)
1801 setreg
1802.endr
1803
1804
1805 vmovdqa 16*i(arg1), \T_key
1806 vaesenclast \T_key, \XMM1, \XMM1
1807 vaesenclast \T_key, \XMM2, \XMM2
1808 vaesenclast \T_key, \XMM3, \XMM3
1809 vaesenclast \T_key, \XMM4, \XMM4
1810 vaesenclast \T_key, \XMM5, \XMM5
1811 vaesenclast \T_key, \XMM6, \XMM6
1812 vaesenclast \T_key, \XMM7, \XMM7
1813 vaesenclast \T_key, \XMM8, \XMM8
1814
1815 vmovdqu (arg3, %r11), \T1
1816 vpxor \T1, \XMM1, \XMM1
1817 vmovdqu \XMM1, (arg2 , %r11)
1818 .if \ENC_DEC == DEC
1819 vmovdqa \T1, \XMM1
1820 .endif
1821
1822 vmovdqu 16*1(arg3, %r11), \T1
1823 vpxor \T1, \XMM2, \XMM2
1824 vmovdqu \XMM2, 16*1(arg2 , %r11)
1825 .if \ENC_DEC == DEC
1826 vmovdqa \T1, \XMM2
1827 .endif
1828
1829 vmovdqu 16*2(arg3, %r11), \T1
1830 vpxor \T1, \XMM3, \XMM3
1831 vmovdqu \XMM3, 16*2(arg2 , %r11)
1832 .if \ENC_DEC == DEC
1833 vmovdqa \T1, \XMM3
1834 .endif
1835
1836 vmovdqu 16*3(arg3, %r11), \T1
1837 vpxor \T1, \XMM4, \XMM4
1838 vmovdqu \XMM4, 16*3(arg2 , %r11)
1839 .if \ENC_DEC == DEC
1840 vmovdqa \T1, \XMM4
1841 .endif
1842
1843 vmovdqu 16*4(arg3, %r11), \T1
1844 vpxor \T1, \XMM5, \XMM5
1845 vmovdqu \XMM5, 16*4(arg2 , %r11)
1846 .if \ENC_DEC == DEC
1847 vmovdqa \T1, \XMM5
1848 .endif
1849
1850 vmovdqu 16*5(arg3, %r11), \T1
1851 vpxor \T1, \XMM6, \XMM6
1852 vmovdqu \XMM6, 16*5(arg2 , %r11)
1853 .if \ENC_DEC == DEC
1854 vmovdqa \T1, \XMM6
1855 .endif
1856
1857 vmovdqu 16*6(arg3, %r11), \T1
1858 vpxor \T1, \XMM7, \XMM7
1859 vmovdqu \XMM7, 16*6(arg2 , %r11)
1860 .if \ENC_DEC == DEC
1861 vmovdqa \T1, \XMM7
1862 .endif
1863
1864 vmovdqu 16*7(arg3, %r11), \T1
1865 vpxor \T1, \XMM8, \XMM8
1866 vmovdqu \XMM8, 16*7(arg2 , %r11)
1867 .if \ENC_DEC == DEC
1868 vmovdqa \T1, \XMM8
1869 .endif
1870
1871 add $128, %r11
1872
1873 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1874 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1875 # the corresponding ciphertext
1876 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1877 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1879 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1880 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1881 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1882 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1883
1884###############################################################################
1885
1886_initial_blocks_done\@:
1887
1888
1889.endm
1890
1891
1892
1893# encrypt 8 blocks at a time
1894# ghash the 8 previously encrypted ciphertext blocks
1895# arg1, arg2, arg3 are used as pointers only, not modified
1896# r11 is the data offset value
1897.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1898
1899 vmovdqa \XMM1, \T2
1900 vmovdqa \XMM2, TMP2(%rsp)
1901 vmovdqa \XMM3, TMP3(%rsp)
1902 vmovdqa \XMM4, TMP4(%rsp)
1903 vmovdqa \XMM5, TMP5(%rsp)
1904 vmovdqa \XMM6, TMP6(%rsp)
1905 vmovdqa \XMM7, TMP7(%rsp)
1906 vmovdqa \XMM8, TMP8(%rsp)
1907
1908.if \loop_idx == in_order
1909 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1910 vpaddd ONE(%rip), \XMM1, \XMM2
1911 vpaddd ONE(%rip), \XMM2, \XMM3
1912 vpaddd ONE(%rip), \XMM3, \XMM4
1913 vpaddd ONE(%rip), \XMM4, \XMM5
1914 vpaddd ONE(%rip), \XMM5, \XMM6
1915 vpaddd ONE(%rip), \XMM6, \XMM7
1916 vpaddd ONE(%rip), \XMM7, \XMM8
1917 vmovdqa \XMM8, \CTR
1918
1919 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1920 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1921 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1922 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1923 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1924 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1925 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1926 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1927.else
1928 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1929 vpaddd ONEf(%rip), \XMM1, \XMM2
1930 vpaddd ONEf(%rip), \XMM2, \XMM3
1931 vpaddd ONEf(%rip), \XMM3, \XMM4
1932 vpaddd ONEf(%rip), \XMM4, \XMM5
1933 vpaddd ONEf(%rip), \XMM5, \XMM6
1934 vpaddd ONEf(%rip), \XMM6, \XMM7
1935 vpaddd ONEf(%rip), \XMM7, \XMM8
1936 vmovdqa \XMM8, \CTR
1937.endif
1938
1939
1940 #######################################################################
1941
1942 vmovdqu (arg1), \T1
1943 vpxor \T1, \XMM1, \XMM1
1944 vpxor \T1, \XMM2, \XMM2
1945 vpxor \T1, \XMM3, \XMM3
1946 vpxor \T1, \XMM4, \XMM4
1947 vpxor \T1, \XMM5, \XMM5
1948 vpxor \T1, \XMM6, \XMM6
1949 vpxor \T1, \XMM7, \XMM7
1950 vpxor \T1, \XMM8, \XMM8
1951
1952 #######################################################################
1953
1954
1955
1956
1957
1958 vmovdqu 16*1(arg1), \T1
1959 vaesenc \T1, \XMM1, \XMM1
1960 vaesenc \T1, \XMM2, \XMM2
1961 vaesenc \T1, \XMM3, \XMM3
1962 vaesenc \T1, \XMM4, \XMM4
1963 vaesenc \T1, \XMM5, \XMM5
1964 vaesenc \T1, \XMM6, \XMM6
1965 vaesenc \T1, \XMM7, \XMM7
1966 vaesenc \T1, \XMM8, \XMM8
1967
1968 vmovdqu 16*2(arg1), \T1
1969 vaesenc \T1, \XMM1, \XMM1
1970 vaesenc \T1, \XMM2, \XMM2
1971 vaesenc \T1, \XMM3, \XMM3
1972 vaesenc \T1, \XMM4, \XMM4
1973 vaesenc \T1, \XMM5, \XMM5
1974 vaesenc \T1, \XMM6, \XMM6
1975 vaesenc \T1, \XMM7, \XMM7
1976 vaesenc \T1, \XMM8, \XMM8
1977
1978
1979 #######################################################################
1980
1981 vmovdqa HashKey_8(arg1), \T5
1982 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1983 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1984 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
1985 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
1986 vpxor \T5, \T6, \T6
1987
1988 vmovdqu 16*3(arg1), \T1
1989 vaesenc \T1, \XMM1, \XMM1
1990 vaesenc \T1, \XMM2, \XMM2
1991 vaesenc \T1, \XMM3, \XMM3
1992 vaesenc \T1, \XMM4, \XMM4
1993 vaesenc \T1, \XMM5, \XMM5
1994 vaesenc \T1, \XMM6, \XMM6
1995 vaesenc \T1, \XMM7, \XMM7
1996 vaesenc \T1, \XMM8, \XMM8
1997
1998 vmovdqa TMP2(%rsp), \T1
1999 vmovdqa HashKey_7(arg1), \T5
2000 vpclmulqdq $0x11, \T5, \T1, \T3
2001 vpxor \T3, \T4, \T4
2002
2003 vpclmulqdq $0x00, \T5, \T1, \T3
2004 vpxor \T3, \T7, \T7
2005
2006 vpclmulqdq $0x01, \T5, \T1, \T3
2007 vpxor \T3, \T6, \T6
2008
2009 vpclmulqdq $0x10, \T5, \T1, \T3
2010 vpxor \T3, \T6, \T6
2011
2012 vmovdqu 16*4(arg1), \T1
2013 vaesenc \T1, \XMM1, \XMM1
2014 vaesenc \T1, \XMM2, \XMM2
2015 vaesenc \T1, \XMM3, \XMM3
2016 vaesenc \T1, \XMM4, \XMM4
2017 vaesenc \T1, \XMM5, \XMM5
2018 vaesenc \T1, \XMM6, \XMM6
2019 vaesenc \T1, \XMM7, \XMM7
2020 vaesenc \T1, \XMM8, \XMM8
2021
2022 #######################################################################
2023
2024 vmovdqa TMP3(%rsp), \T1
2025 vmovdqa HashKey_6(arg1), \T5
2026 vpclmulqdq $0x11, \T5, \T1, \T3
2027 vpxor \T3, \T4, \T4
2028
2029 vpclmulqdq $0x00, \T5, \T1, \T3
2030 vpxor \T3, \T7, \T7
2031
2032 vpclmulqdq $0x01, \T5, \T1, \T3
2033 vpxor \T3, \T6, \T6
2034
2035 vpclmulqdq $0x10, \T5, \T1, \T3
2036 vpxor \T3, \T6, \T6
2037
2038 vmovdqu 16*5(arg1), \T1
2039 vaesenc \T1, \XMM1, \XMM1
2040 vaesenc \T1, \XMM2, \XMM2
2041 vaesenc \T1, \XMM3, \XMM3
2042 vaesenc \T1, \XMM4, \XMM4
2043 vaesenc \T1, \XMM5, \XMM5
2044 vaesenc \T1, \XMM6, \XMM6
2045 vaesenc \T1, \XMM7, \XMM7
2046 vaesenc \T1, \XMM8, \XMM8
2047
2048 vmovdqa TMP4(%rsp), \T1
2049 vmovdqa HashKey_5(arg1), \T5
2050 vpclmulqdq $0x11, \T5, \T1, \T3
2051 vpxor \T3, \T4, \T4
2052
2053 vpclmulqdq $0x00, \T5, \T1, \T3
2054 vpxor \T3, \T7, \T7
2055
2056 vpclmulqdq $0x01, \T5, \T1, \T3
2057 vpxor \T3, \T6, \T6
2058
2059 vpclmulqdq $0x10, \T5, \T1, \T3
2060 vpxor \T3, \T6, \T6
2061
2062 vmovdqu 16*6(arg1), \T1
2063 vaesenc \T1, \XMM1, \XMM1
2064 vaesenc \T1, \XMM2, \XMM2
2065 vaesenc \T1, \XMM3, \XMM3
2066 vaesenc \T1, \XMM4, \XMM4
2067 vaesenc \T1, \XMM5, \XMM5
2068 vaesenc \T1, \XMM6, \XMM6
2069 vaesenc \T1, \XMM7, \XMM7
2070 vaesenc \T1, \XMM8, \XMM8
2071
2072
2073 vmovdqa TMP5(%rsp), \T1
2074 vmovdqa HashKey_4(arg1), \T5
2075 vpclmulqdq $0x11, \T5, \T1, \T3
2076 vpxor \T3, \T4, \T4
2077
2078 vpclmulqdq $0x00, \T5, \T1, \T3
2079 vpxor \T3, \T7, \T7
2080
2081 vpclmulqdq $0x01, \T5, \T1, \T3
2082 vpxor \T3, \T6, \T6
2083
2084 vpclmulqdq $0x10, \T5, \T1, \T3
2085 vpxor \T3, \T6, \T6
2086
2087 vmovdqu 16*7(arg1), \T1
2088 vaesenc \T1, \XMM1, \XMM1
2089 vaesenc \T1, \XMM2, \XMM2
2090 vaesenc \T1, \XMM3, \XMM3
2091 vaesenc \T1, \XMM4, \XMM4
2092 vaesenc \T1, \XMM5, \XMM5
2093 vaesenc \T1, \XMM6, \XMM6
2094 vaesenc \T1, \XMM7, \XMM7
2095 vaesenc \T1, \XMM8, \XMM8
2096
2097 vmovdqa TMP6(%rsp), \T1
2098 vmovdqa HashKey_3(arg1), \T5
2099 vpclmulqdq $0x11, \T5, \T1, \T3
2100 vpxor \T3, \T4, \T4
2101
2102 vpclmulqdq $0x00, \T5, \T1, \T3
2103 vpxor \T3, \T7, \T7
2104
2105 vpclmulqdq $0x01, \T5, \T1, \T3
2106 vpxor \T3, \T6, \T6
2107
2108 vpclmulqdq $0x10, \T5, \T1, \T3
2109 vpxor \T3, \T6, \T6
2110
2111 vmovdqu 16*8(arg1), \T1
2112 vaesenc \T1, \XMM1, \XMM1
2113 vaesenc \T1, \XMM2, \XMM2
2114 vaesenc \T1, \XMM3, \XMM3
2115 vaesenc \T1, \XMM4, \XMM4
2116 vaesenc \T1, \XMM5, \XMM5
2117 vaesenc \T1, \XMM6, \XMM6
2118 vaesenc \T1, \XMM7, \XMM7
2119 vaesenc \T1, \XMM8, \XMM8
2120
2121 vmovdqa TMP7(%rsp), \T1
2122 vmovdqa HashKey_2(arg1), \T5
2123 vpclmulqdq $0x11, \T5, \T1, \T3
2124 vpxor \T3, \T4, \T4
2125
2126 vpclmulqdq $0x00, \T5, \T1, \T3
2127 vpxor \T3, \T7, \T7
2128
2129 vpclmulqdq $0x01, \T5, \T1, \T3
2130 vpxor \T3, \T6, \T6
2131
2132 vpclmulqdq $0x10, \T5, \T1, \T3
2133 vpxor \T3, \T6, \T6
2134
2135
2136 #######################################################################
2137
2138 vmovdqu 16*9(arg1), \T5
2139 vaesenc \T5, \XMM1, \XMM1
2140 vaesenc \T5, \XMM2, \XMM2
2141 vaesenc \T5, \XMM3, \XMM3
2142 vaesenc \T5, \XMM4, \XMM4
2143 vaesenc \T5, \XMM5, \XMM5
2144 vaesenc \T5, \XMM6, \XMM6
2145 vaesenc \T5, \XMM7, \XMM7
2146 vaesenc \T5, \XMM8, \XMM8
2147
2148 vmovdqa TMP8(%rsp), \T1
2149 vmovdqa HashKey(arg1), \T5
2150
2151 vpclmulqdq $0x00, \T5, \T1, \T3
2152 vpxor \T3, \T7, \T7
2153
2154 vpclmulqdq $0x01, \T5, \T1, \T3
2155 vpxor \T3, \T6, \T6
2156
2157 vpclmulqdq $0x10, \T5, \T1, \T3
2158 vpxor \T3, \T6, \T6
2159
2160 vpclmulqdq $0x11, \T5, \T1, \T3
2161 vpxor \T3, \T4, \T1
2162
2163
2164 vmovdqu 16*10(arg1), \T5
2165
2166 i = 0
2167 j = 1
2168 setreg
2169.rep 8
2170 vpxor 16*i(arg3, %r11), \T5, \T2
2171 .if \ENC_DEC == ENC
2172 vaesenclast \T2, reg_j, reg_j
2173 .else
2174 vaesenclast \T2, reg_j, \T3
2175 vmovdqu 16*i(arg3, %r11), reg_j
2176 vmovdqu \T3, 16*i(arg2, %r11)
2177 .endif
2178 i = (i+1)
2179 j = (j+1)
2180 setreg
2181.endr
2182 #######################################################################
2183
2184
2185 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
 2186 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2187 vpxor \T3, \T7, \T7
2188 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2189
2190
2191
2192 #######################################################################
2193 #first phase of the reduction
2194 vmovdqa POLY2(%rip), \T3
2195
2196 vpclmulqdq $0x01, \T7, \T3, \T2
 2197 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2198
2199 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2200 #######################################################################
2201 .if \ENC_DEC == ENC
2202 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2203 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2204 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2205 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2206 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2207 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2208 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2209 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2210 .endif
2211
2212 #######################################################################
2213 #second phase of the reduction
2214 vpclmulqdq $0x00, \T7, \T3, \T2
 2215 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2216
2217 vpclmulqdq $0x10, \T7, \T3, \T4
 2218 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2219
2220 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2221 #######################################################################
2222 vpxor \T4, \T1, \T1 # the result is in T1
2223
2224 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2225 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2226 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2227 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2228 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2229 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2230 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2231 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2232
2233
2234 vpxor \T1, \XMM1, \XMM1
2235
2236
2237
2238.endm
2239
2240
 2241# GHASH the last 8 ciphertext blocks.
2242.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2243
2244 ## Karatsuba Method
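	## Each 128-bit block X and hash key H is split into 64-bit halves,
	## X = X1:X0 and H = H1:H0, so only three carry-less multiplies are
	## needed per block:
	##     hi  = X1*H1                  (vpclmulqdq $0x11)
	##     lo  = X0*H0                  (vpclmulqdq $0x00)
	##     mid = (X1^X0)*(H1^H0)        (vpclmulqdq $0x00 on the xored halves)
	## with the middle 128-bit term given by mid ^ hi ^ lo. Below, \T6
	## accumulates hi, \T7 accumulates lo and \XMM1 accumulates mid across
	## all 8 blocks; hi and lo are folded into the middle term after the
	## last block.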
2245
2246 vmovdqa HashKey_8(arg1), \T5
2247
2248 vpshufd $0b01001110, \XMM1, \T2
2249 vpshufd $0b01001110, \T5, \T3
2250 vpxor \XMM1, \T2, \T2
2251 vpxor \T5, \T3, \T3
2252
2253 vpclmulqdq $0x11, \T5, \XMM1, \T6
2254 vpclmulqdq $0x00, \T5, \XMM1, \T7
2255
2256 vpclmulqdq $0x00, \T3, \T2, \XMM1
2257
2258 ######################
2259
2260 vmovdqa HashKey_7(arg1), \T5
2261 vpshufd $0b01001110, \XMM2, \T2
2262 vpshufd $0b01001110, \T5, \T3
2263 vpxor \XMM2, \T2, \T2
2264 vpxor \T5, \T3, \T3
2265
2266 vpclmulqdq $0x11, \T5, \XMM2, \T4
2267 vpxor \T4, \T6, \T6
2268
2269 vpclmulqdq $0x00, \T5, \XMM2, \T4
2270 vpxor \T4, \T7, \T7
2271
2272 vpclmulqdq $0x00, \T3, \T2, \T2
2273
2274 vpxor \T2, \XMM1, \XMM1
2275
2276 ######################
2277
2278 vmovdqa HashKey_6(arg1), \T5
2279 vpshufd $0b01001110, \XMM3, \T2
2280 vpshufd $0b01001110, \T5, \T3
2281 vpxor \XMM3, \T2, \T2
2282 vpxor \T5, \T3, \T3
2283
2284 vpclmulqdq $0x11, \T5, \XMM3, \T4
2285 vpxor \T4, \T6, \T6
2286
2287 vpclmulqdq $0x00, \T5, \XMM3, \T4
2288 vpxor \T4, \T7, \T7
2289
2290 vpclmulqdq $0x00, \T3, \T2, \T2
2291
2292 vpxor \T2, \XMM1, \XMM1
2293
2294 ######################
2295
2296 vmovdqa HashKey_5(arg1), \T5
2297 vpshufd $0b01001110, \XMM4, \T2
2298 vpshufd $0b01001110, \T5, \T3
2299 vpxor \XMM4, \T2, \T2
2300 vpxor \T5, \T3, \T3
2301
2302 vpclmulqdq $0x11, \T5, \XMM4, \T4
2303 vpxor \T4, \T6, \T6
2304
2305 vpclmulqdq $0x00, \T5, \XMM4, \T4
2306 vpxor \T4, \T7, \T7
2307
2308 vpclmulqdq $0x00, \T3, \T2, \T2
2309
2310 vpxor \T2, \XMM1, \XMM1
2311
2312 ######################
2313
2314 vmovdqa HashKey_4(arg1), \T5
2315 vpshufd $0b01001110, \XMM5, \T2
2316 vpshufd $0b01001110, \T5, \T3
2317 vpxor \XMM5, \T2, \T2
2318 vpxor \T5, \T3, \T3
2319
2320 vpclmulqdq $0x11, \T5, \XMM5, \T4
2321 vpxor \T4, \T6, \T6
2322
2323 vpclmulqdq $0x00, \T5, \XMM5, \T4
2324 vpxor \T4, \T7, \T7
2325
2326 vpclmulqdq $0x00, \T3, \T2, \T2
2327
2328 vpxor \T2, \XMM1, \XMM1
2329
2330 ######################
2331
2332 vmovdqa HashKey_3(arg1), \T5
2333 vpshufd $0b01001110, \XMM6, \T2
2334 vpshufd $0b01001110, \T5, \T3
2335 vpxor \XMM6, \T2, \T2
2336 vpxor \T5, \T3, \T3
2337
2338 vpclmulqdq $0x11, \T5, \XMM6, \T4
2339 vpxor \T4, \T6, \T6
2340
2341 vpclmulqdq $0x00, \T5, \XMM6, \T4
2342 vpxor \T4, \T7, \T7
2343
2344 vpclmulqdq $0x00, \T3, \T2, \T2
2345
2346 vpxor \T2, \XMM1, \XMM1
2347
2348 ######################
2349
2350 vmovdqa HashKey_2(arg1), \T5
2351 vpshufd $0b01001110, \XMM7, \T2
2352 vpshufd $0b01001110, \T5, \T3
2353 vpxor \XMM7, \T2, \T2
2354 vpxor \T5, \T3, \T3
2355
2356 vpclmulqdq $0x11, \T5, \XMM7, \T4
2357 vpxor \T4, \T6, \T6
2358
2359 vpclmulqdq $0x00, \T5, \XMM7, \T4
2360 vpxor \T4, \T7, \T7
2361
2362 vpclmulqdq $0x00, \T3, \T2, \T2
2363
2364 vpxor \T2, \XMM1, \XMM1
2365
2366 ######################
2367
2368 vmovdqa HashKey(arg1), \T5
2369 vpshufd $0b01001110, \XMM8, \T2
2370 vpshufd $0b01001110, \T5, \T3
2371 vpxor \XMM8, \T2, \T2
2372 vpxor \T5, \T3, \T3
2373
2374 vpclmulqdq $0x11, \T5, \XMM8, \T4
2375 vpxor \T4, \T6, \T6
2376
2377 vpclmulqdq $0x00, \T5, \XMM8, \T4
2378 vpxor \T4, \T7, \T7
2379
2380 vpclmulqdq $0x00, \T3, \T2, \T2
2381
2382 vpxor \T2, \XMM1, \XMM1
2383 vpxor \T6, \XMM1, \XMM1
2384 vpxor \T7, \XMM1, \T2
2385
2386
2387
2388
2389 vpslldq $8, \T2, \T4
2390 vpsrldq $8, \T2, \T2
2391
2392 vpxor \T4, \T7, \T7
2393 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2394 # accumulated carry-less multiplications
2395
2396 #######################################################################
2397 #first phase of the reduction
2398 vmovdqa POLY2(%rip), \T3
2399
2400 vpclmulqdq $0x01, \T7, \T3, \T2
 2401 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2402
2403 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2404 #######################################################################
2405
2406
2407 #second phase of the reduction
2408 vpclmulqdq $0x00, \T7, \T3, \T2
2409 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2410
2411 vpclmulqdq $0x10, \T7, \T3, \T4
2412 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2413
2414 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2415 #######################################################################
2416 vpxor \T4, \T6, \T6 # the result is in T6
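	# At this point the 256-bit carry-less product accumulated in <T6:T7>
	# has been reduced to a single 128-bit GHASH value modulo the GCM
	# polynomial g(x) = x^128 + x^7 + x^2 + x + 1. The two reduction phases
	# above use carry-less multiplies by the precomputed POLY2 constant
	# instead of long shift-and-xor chains, matching the HashKey<<1 mod poly
	# form set up at precompute time.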
2417.endm
2418
2419
2420
2421# combined for GCM encrypt and decrypt functions
2422# clobbering all xmm registers
2423# clobbering r10, r11, r12, r13, r14, r15
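# Flow: encrypt/GHASH 0..7 initial blocks to align on an 8-block boundary,
# run the bulk of the data through GHASH_8_ENCRYPT_8_PARALLEL_AVX2 at 128
# bytes per iteration, GHASH the final 8 blocks with GHASH_LAST_8_AVX2,
# handle a trailing partial (<16 byte) block, then hash in len(A)||len(C)
# and produce the tag as GHASH xor E(K, Y0).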
2424.macro GCM_ENC_DEC_AVX2 ENC_DEC
2425
2426 #the number of pushes must equal STACK_OFFSET
2427 push %r12
2428 push %r13
2429 push %r14
2430 push %r15
2431
2432 mov %rsp, %r14
2433
2434
2435
2436
2437 sub $VARIABLE_OFFSET, %rsp
2438 and $~63, %rsp # align rsp to 64 bytes
2439
2440
2441 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2442
2443 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2444 and $-16, %r13 # r13 = r13 - (r13 mod 16)
2445
2446 mov %r13, %r12
2447 shr $4, %r12
2448 and $7, %r12
2449 jz _initial_num_blocks_is_0\@
2450
2451 cmp $7, %r12
2452 je _initial_num_blocks_is_7\@
2453 cmp $6, %r12
2454 je _initial_num_blocks_is_6\@
2455 cmp $5, %r12
2456 je _initial_num_blocks_is_5\@
2457 cmp $4, %r12
2458 je _initial_num_blocks_is_4\@
2459 cmp $3, %r12
2460 je _initial_num_blocks_is_3\@
2461 cmp $2, %r12
2462 je _initial_num_blocks_is_2\@
2463
2464 jmp _initial_num_blocks_is_1\@
2465
2466_initial_num_blocks_is_7\@:
2467 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2468 sub $16*7, %r13
2469 jmp _initial_blocks_encrypted\@
2470
2471_initial_num_blocks_is_6\@:
2472 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2473 sub $16*6, %r13
2474 jmp _initial_blocks_encrypted\@
2475
2476_initial_num_blocks_is_5\@:
2477 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2478 sub $16*5, %r13
2479 jmp _initial_blocks_encrypted\@
2480
2481_initial_num_blocks_is_4\@:
2482 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2483 sub $16*4, %r13
2484 jmp _initial_blocks_encrypted\@
2485
2486_initial_num_blocks_is_3\@:
2487 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2488 sub $16*3, %r13
2489 jmp _initial_blocks_encrypted\@
2490
2491_initial_num_blocks_is_2\@:
2492 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2493 sub $16*2, %r13
2494 jmp _initial_blocks_encrypted\@
2495
2496_initial_num_blocks_is_1\@:
2497 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2498 sub $16*1, %r13
2499 jmp _initial_blocks_encrypted\@
2500
2501_initial_num_blocks_is_0\@:
2502 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2503
2504
2505_initial_blocks_encrypted\@:
2506 cmp $0, %r13
2507 je _zero_cipher_left\@
2508
2509 sub $128, %r13
2510 je _eight_cipher_left\@
2511
2512
2513
2514
2515 vmovd %xmm9, %r15d
2516 and $255, %r15d
2517 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2518
2519
2520_encrypt_by_8_new\@:
2521 cmp $(255-8), %r15d
2522 jg _encrypt_by_8\@
2523
2524
2525
2526 add $8, %r15b
2527 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2528 add $128, %r11
2529 sub $128, %r13
2530 jne _encrypt_by_8_new\@
2531
2532 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2533 jmp _eight_cipher_left\@
2534
2535_encrypt_by_8\@:
2536 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2537 add $8, %r15b
2538 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2539 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2540 add $128, %r11
2541 sub $128, %r13
2542 jne _encrypt_by_8_new\@
2543
2544 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2545
2546
2547
2548
2549_eight_cipher_left\@:
2550 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2551
2552
2553_zero_cipher_left\@:
2554 cmp $16, arg4
2555 jl _only_less_than_16\@
2556
2557 mov arg4, %r13
2558 and $15, %r13 # r13 = (arg4 mod 16)
2559
2560 je _multiple_of_16_bytes\@
2561
 2562 # handle the last <16 Byte block separately
2563
2564
2565 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2566 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2567 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2568
2569 sub $16, %r11
2570 add %r13, %r11
2571 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
2572
2573 lea SHIFT_MASK+16(%rip), %r12
2574 sub %r13, %r12 # adjust the shuffle mask pointer
2575 # to be able to shift 16-r13 bytes
2576 # (r13 is the number of bytes in plaintext mod 16)
2577 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2578 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2579 jmp _final_ghash_mul\@
2580
2581_only_less_than_16\@:
2582 # check for 0 length
2583 mov arg4, %r13
2584 and $15, %r13 # r13 = (arg4 mod 16)
2585
2586 je _multiple_of_16_bytes\@
2587
 2588 # handle the last <16 Byte block separately
2589
2590
2591 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2592 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2593 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2594
2595
2596 lea SHIFT_MASK+16(%rip), %r12
2597 sub %r13, %r12 # adjust the shuffle mask pointer to be
2598 # able to shift 16-r13 bytes (r13 is the
2599 # number of bytes in plaintext mod 16)
2600
2601_get_last_16_byte_loop\@:
2602 movb (arg3, %r11), %al
2603 movb %al, TMP1 (%rsp , %r11)
2604 add $1, %r11
2605 cmp %r13, %r11
2606 jne _get_last_16_byte_loop\@
2607
2608 vmovdqu TMP1(%rsp), %xmm1
2609
2610 sub $16, %r11
2611
2612_final_ghash_mul\@:
2613 .if \ENC_DEC == DEC
2614 vmovdqa %xmm1, %xmm2
2615 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2616 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2617 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2618 vpand %xmm1, %xmm2, %xmm2
2619 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2620 vpxor %xmm2, %xmm14, %xmm14
2621 #GHASH computation for the last <16 Byte block
2622 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2623 sub %r13, %r11
2624 add $16, %r11
2625 .else
2626 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2627 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2628 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2629 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2630 vpxor %xmm9, %xmm14, %xmm14
2631 #GHASH computation for the last <16 Byte block
2632 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2633 sub %r13, %r11
2634 add $16, %r11
2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
2636 .endif
2637
2638
2639 #############################
2640 # output r13 Bytes
2641 vmovq %xmm9, %rax
2642 cmp $8, %r13
2643 jle _less_than_8_bytes_left\@
2644
2645 mov %rax, (arg2 , %r11)
2646 add $8, %r11
2647 vpsrldq $8, %xmm9, %xmm9
2648 vmovq %xmm9, %rax
2649 sub $8, %r13
2650
2651_less_than_8_bytes_left\@:
2652 movb %al, (arg2 , %r11)
2653 add $1, %r11
2654 shr $8, %rax
2655 sub $1, %r13
2656 jne _less_than_8_bytes_left\@
2657 #############################
2658
2659_multiple_of_16_bytes\@:
2660 mov arg7, %r12 # r12 = aadLen (number of bytes)
2661 shl $3, %r12 # convert into number of bits
2662 vmovd %r12d, %xmm15 # len(A) in xmm15
2663
 2664 shl $3, arg4 # len(C) in bits (*8)
2665 vmovq arg4, %xmm1
2666 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
2667 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2668
2669 vpxor %xmm15, %xmm14, %xmm14
2670 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2671 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2672
2673 mov arg5, %rax # rax = *Y0
2674 vmovdqu (%rax), %xmm9 # xmm9 = Y0
2675
2676 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2677
2678 vpxor %xmm14, %xmm9, %xmm9
2679
2680
2681
2682_return_T\@:
2683 mov arg8, %r10 # r10 = authTag
2684 mov arg9, %r11 # r11 = auth_tag_len
2685
2686 cmp $16, %r11
2687 je _T_16\@
2688
2689 cmp $12, %r11
2690 je _T_12\@
2691
2692_T_8\@:
2693 vmovq %xmm9, %rax
2694 mov %rax, (%r10)
2695 jmp _return_T_done\@
2696_T_12\@:
2697 vmovq %xmm9, %rax
2698 mov %rax, (%r10)
2699 vpsrldq $8, %xmm9, %xmm9
2700 vmovd %xmm9, %eax
2701 mov %eax, 8(%r10)
2702 jmp _return_T_done\@
2703
2704_T_16\@:
2705 vmovdqu %xmm9, (%r10)
2706
2707_return_T_done\@:
2708 mov %r14, %rsp
2709
2710 pop %r15
2711 pop %r14
2712 pop %r13
2713 pop %r12
2714.endm
2715
2716
2717#############################################################
2718#void aesni_gcm_precomp_avx_gen4
2719# (gcm_data *my_ctx_data,
2720# u8 *hash_subkey)# /* H, the Hash sub key input.
2721# Data starts on a 16-byte boundary. */
2722#############################################################
2723ENTRY(aesni_gcm_precomp_avx_gen4)
2724 #the number of pushes must equal STACK_OFFSET
2725 push %r12
2726 push %r13
2727 push %r14
2728 push %r15
2729
2730 mov %rsp, %r14
2731
2732
2733
2734 sub $VARIABLE_OFFSET, %rsp
2735 and $~63, %rsp # align rsp to 64 bytes
2736
2737 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2738
2739 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2740 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2741 vmovdqa %xmm6, %xmm2
2742 vpsllq $1, %xmm6, %xmm6
2743 vpsrlq $63, %xmm2, %xmm2
2744 vmovdqa %xmm2, %xmm1
2745 vpslldq $8, %xmm2, %xmm2
2746 vpsrldq $8, %xmm1, %xmm1
2747 vpor %xmm2, %xmm6, %xmm6
2748 #reduction
2749 vpshufd $0b00100100, %xmm1, %xmm2
2750 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2751 vpand POLY(%rip), %xmm2, %xmm2
2752 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2753 #######################################################################
2754 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
2755
2756
2757 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2758
2759 mov %r14, %rsp
2760
2761 pop %r15
2762 pop %r14
2763 pop %r13
2764 pop %r12
2765 ret
2766ENDPROC(aesni_gcm_precomp_avx_gen4)
2767
2768
2769###############################################################################
2770#void aesni_gcm_enc_avx_gen4(
2771# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2772# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2773# const u8 *in, /* Plaintext input */
2774# u64 plaintext_len, /* Length of data in Bytes for encryption. */
2775# u8 *iv, /* Pre-counter block j0: 4 byte salt
2776# (from Security Association) concatenated with 8 byte
2777# Initialisation Vector (from IPSec ESP Payload)
2778# concatenated with 0x00000001. 16-byte aligned pointer. */
2779# const u8 *aad, /* Additional Authentication Data (AAD)*/
2780# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2781# u8 *auth_tag, /* Authenticated Tag output. */
2782# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2783# Valid values are 16 (most likely), 12 or 8. */
2784###############################################################################
2785ENTRY(aesni_gcm_enc_avx_gen4)
2786 GCM_ENC_DEC_AVX2 ENC
2787 ret
2788ENDPROC(aesni_gcm_enc_avx_gen4)
2789
2790###############################################################################
2791#void aesni_gcm_dec_avx_gen4(
2792# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2793# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2794# const u8 *in, /* Ciphertext input */
 2795# u64 plaintext_len, /* Length of data in Bytes for decryption. */
2796# u8 *iv, /* Pre-counter block j0: 4 byte salt
2797# (from Security Association) concatenated with 8 byte
2798# Initialisation Vector (from IPSec ESP Payload)
2799# concatenated with 0x00000001. 16-byte aligned pointer. */
2800# const u8 *aad, /* Additional Authentication Data (AAD)*/
2801# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2802# u8 *auth_tag, /* Authenticated Tag output. */
2803# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2804# Valid values are 16 (most likely), 12 or 8. */
2805###############################################################################
2806ENTRY(aesni_gcm_dec_avx_gen4)
2807 GCM_ENC_DEC_AVX2 DEC
2808 ret
2809ENDPROC(aesni_gcm_dec_avx_gen4)
2810
2811#endif /* CONFIG_AS_AVX2 */
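
The glue changes below select between these routines at run time. As a minimal
sketch of how the gen4 entry points are driven (mirroring the added
aesni_gcm_enc_avx2() dispatch; the wrapper name and the assumption that the
context has already been key-expanded are illustrative, not part of the patch):

/* Illustrative wrapper only; all pointers and lengths are assumed valid. */
static void example_gcm_enc_gen4(void *ctx, u8 *dst, const u8 *src,
				 unsigned long len, u8 *iv, u8 *hash_subkey,
				 const u8 *aad, unsigned long aad_len,
				 u8 *tag, unsigned long tag_len)
{
	/* precompute HashKey through HashKey_8 for this hash subkey */
	aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
	/* bulk encrypt and authenticate */
	aesni_gcm_enc_avx_gen4(ctx, dst, src, len, iv,
			       aad, aad_len, tag, tag_len);
}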
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 835488b745ee..948ad0e77741 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
101int crypto_fpu_init(void); 101int crypto_fpu_init(void);
102void crypto_fpu_exit(void); 102void crypto_fpu_exit(void);
103 103
104#define AVX_GEN2_OPTSIZE 640
105#define AVX_GEN4_OPTSIZE 4096
106
104#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
105asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 108asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
106 const u8 *in, unsigned int len, u8 *iv); 109 const u8 *in, unsigned int len, u8 *iv);
@@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
150 u8 *hash_subkey, const u8 *aad, unsigned long aad_len, 153 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
151 u8 *auth_tag, unsigned long auth_tag_len); 154 u8 *auth_tag, unsigned long auth_tag_len);
152 155
156
157#ifdef CONFIG_AS_AVX
158/*
159 * asmlinkage void aesni_gcm_precomp_avx_gen2()
160 * gcm_data *my_ctx_data, context data
161 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
162 */
163asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
164
165asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
166 const u8 *in, unsigned long plaintext_len, u8 *iv,
167 const u8 *aad, unsigned long aad_len,
168 u8 *auth_tag, unsigned long auth_tag_len);
169
170asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
171 const u8 *in, unsigned long ciphertext_len, u8 *iv,
172 const u8 *aad, unsigned long aad_len,
173 u8 *auth_tag, unsigned long auth_tag_len);
174
175static void aesni_gcm_enc_avx(void *ctx, u8 *out,
176 const u8 *in, unsigned long plaintext_len, u8 *iv,
177 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
178 u8 *auth_tag, unsigned long auth_tag_len)
179{
180 if (plaintext_len < AVX_GEN2_OPTSIZE) {
181 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
182 aad_len, auth_tag, auth_tag_len);
183 } else {
184 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
185 aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
186 aad_len, auth_tag, auth_tag_len);
187 }
188}
189
190static void aesni_gcm_dec_avx(void *ctx, u8 *out,
191 const u8 *in, unsigned long ciphertext_len, u8 *iv,
192 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
193 u8 *auth_tag, unsigned long auth_tag_len)
194{
195 if (ciphertext_len < AVX_GEN2_OPTSIZE) {
196 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
197 aad_len, auth_tag, auth_tag_len);
198 } else {
199 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
200 aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
201 aad_len, auth_tag, auth_tag_len);
202 }
203}
204#endif
205
206#ifdef CONFIG_AS_AVX2
207/*
208 * asmlinkage void aesni_gcm_precomp_avx_gen4()
209 * gcm_data *my_ctx_data, context data
210 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
211 */
212asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
213
214asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
215 const u8 *in, unsigned long plaintext_len, u8 *iv,
216 const u8 *aad, unsigned long aad_len,
217 u8 *auth_tag, unsigned long auth_tag_len);
218
219asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
220 const u8 *in, unsigned long ciphertext_len, u8 *iv,
221 const u8 *aad, unsigned long aad_len,
222 u8 *auth_tag, unsigned long auth_tag_len);
223
224static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
225 const u8 *in, unsigned long plaintext_len, u8 *iv,
226 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
227 u8 *auth_tag, unsigned long auth_tag_len)
228{
229 if (plaintext_len < AVX_GEN2_OPTSIZE) {
230 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
231 aad_len, auth_tag, auth_tag_len);
232 } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
233 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
234 aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
235 aad_len, auth_tag, auth_tag_len);
236 } else {
237 aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
238 aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
239 aad_len, auth_tag, auth_tag_len);
240 }
241}
242
243static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
244 const u8 *in, unsigned long ciphertext_len, u8 *iv,
245 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
246 u8 *auth_tag, unsigned long auth_tag_len)
247{
248 if (ciphertext_len < AVX_GEN2_OPTSIZE) {
249 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
250 aad, aad_len, auth_tag, auth_tag_len);
251 } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
252 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
253 aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
254 aad_len, auth_tag, auth_tag_len);
255 } else {
256 aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
257 aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
258 aad_len, auth_tag, auth_tag_len);
259 }
260}
261#endif
262
263static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
264 const u8 *in, unsigned long plaintext_len, u8 *iv,
265 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
266 u8 *auth_tag, unsigned long auth_tag_len);
267
268static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
269 const u8 *in, unsigned long ciphertext_len, u8 *iv,
270 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
271 u8 *auth_tag, unsigned long auth_tag_len);
272
153static inline struct 273static inline struct
154aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) 274aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
155{ 275{
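
Worked example of the new dispatch thresholds: with the AVX2 handlers installed,
a request under 640 bytes (AVX_GEN2_OPTSIZE) still goes through the original SSE
aesni_gcm_enc()/aesni_gcm_dec(), a 1 KB request falls in [640, 4096) and takes
the gen2 (AVX) path, and anything of 4096 bytes (AVX_GEN4_OPTSIZE) or more uses
the gen4 (AVX2) code.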
@@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
915 dst = src; 1035 dst = src;
916 } 1036 }
917 1037
918 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, 1038 aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
919 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst 1039 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
920 + ((unsigned long)req->cryptlen), auth_tag_len); 1040 + ((unsigned long)req->cryptlen), auth_tag_len);
921 1041
@@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
996 dst = src; 1116 dst = src;
997 } 1117 }
998 1118
999 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, 1119 aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
1000 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, 1120 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1001 authTag, auth_tag_len); 1121 authTag, auth_tag_len);
1002 1122
1003 /* Compare generated tag with passed in tag. */ 1123 /* Compare generated tag with passed in tag. */
1004 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? 1124 retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
1005 -EBADMSG : 0; 1125 -EBADMSG : 0;
1006 1126
1007 if (one_entry_in_sg) { 1127 if (one_entry_in_sg) {
@@ -1353,6 +1473,27 @@ static int __init aesni_init(void)
1353 1473
1354 if (!x86_match_cpu(aesni_cpu_id)) 1474 if (!x86_match_cpu(aesni_cpu_id))
1355 return -ENODEV; 1475 return -ENODEV;
1476#ifdef CONFIG_X86_64
1477#ifdef CONFIG_AS_AVX2
1478 if (boot_cpu_has(X86_FEATURE_AVX2)) {
1479 pr_info("AVX2 version of gcm_enc/dec engaged.\n");
1480 aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
1481 aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
1482 } else
1483#endif
1484#ifdef CONFIG_AS_AVX
1485 if (boot_cpu_has(X86_FEATURE_AVX)) {
1486 pr_info("AVX version of gcm_enc/dec engaged.\n");
1487 aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
1488 aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
1489 } else
1490#endif
1491 {
1492 pr_info("SSE version of gcm_enc/dec engaged.\n");
1493 aesni_gcm_enc_tfm = aesni_gcm_enc;
1494 aesni_gcm_dec_tfm = aesni_gcm_dec;
1495 }
1496#endif
1356 1497
1357 err = crypto_fpu_init(); 1498 err = crypto_fpu_init();
1358 if (err) 1499 if (err)
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 0d9ec770f2f8..e6a92455740e 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -39,6 +39,20 @@
39 39
40#ifdef CONFIG_ARCH_RANDOM 40#ifdef CONFIG_ARCH_RANDOM
41 41
42/* Instead of arch_get_random_long() when alternatives haven't run. */
43static inline int rdrand_long(unsigned long *v)
44{
45 int ok;
46 asm volatile("1: " RDRAND_LONG "\n\t"
47 "jc 2f\n\t"
48 "decl %0\n\t"
49 "jnz 1b\n\t"
50 "2:"
51 : "=r" (ok), "=a" (*v)
52 : "0" (RDRAND_RETRY_LOOPS));
53 return ok;
54}
55
42#define GET_RANDOM(name, type, rdrand, nop) \ 56#define GET_RANDOM(name, type, rdrand, nop) \
43static inline int name(type *v) \ 57static inline int name(type *v) \
44{ \ 58{ \
@@ -68,6 +82,13 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
68 82
69#endif /* CONFIG_X86_64 */ 83#endif /* CONFIG_X86_64 */
70 84
85#else
86
87static inline int rdrand_long(unsigned long *v)
88{
89 return 0;
90}
91
71#endif /* CONFIG_ARCH_RANDOM */ 92#endif /* CONFIG_ARCH_RANDOM */
72 93
73extern void x86_init_rdrand(struct cpuinfo_x86 *c); 94extern void x86_init_rdrand(struct cpuinfo_x86 *c);
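
A hedged sketch of how rdrand_long() is meant to be used before alternatives
have run; the self-test shape and function name are illustrative, not the
actual x86_init_rdrand() body:

static void example_rdrand_sanity_check(void)
{
	unsigned long sample;
	int i;

	for (i = 0; i < 8; i++) {
		if (!rdrand_long(&sample)) {	/* 0: RDRAND failed after retries */
			pr_warn("RDRAND returned no data, not using it\n");
			return;
		}
	}
}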
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index c6cd358a1eec..04a48903b2eb 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -92,12 +92,53 @@
92#endif 92#endif
93#define smp_read_barrier_depends() read_barrier_depends() 93#define smp_read_barrier_depends() read_barrier_depends()
94#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) 94#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
95#else 95#else /* !SMP */
96#define smp_mb() barrier() 96#define smp_mb() barrier()
97#define smp_rmb() barrier() 97#define smp_rmb() barrier()
98#define smp_wmb() barrier() 98#define smp_wmb() barrier()
99#define smp_read_barrier_depends() do { } while (0) 99#define smp_read_barrier_depends() do { } while (0)
100#define set_mb(var, value) do { var = value; barrier(); } while (0) 100#define set_mb(var, value) do { var = value; barrier(); } while (0)
101#endif /* SMP */
102
103#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
104
105/*
106 * For either of these options x86 doesn't have a strong TSO memory
107 * model and we should fall back to full barriers.
108 */
109
110#define smp_store_release(p, v) \
111do { \
112 compiletime_assert_atomic_type(*p); \
113 smp_mb(); \
114 ACCESS_ONCE(*p) = (v); \
115} while (0)
116
117#define smp_load_acquire(p) \
118({ \
119 typeof(*p) ___p1 = ACCESS_ONCE(*p); \
120 compiletime_assert_atomic_type(*p); \
121 smp_mb(); \
122 ___p1; \
123})
124
125#else /* regular x86 TSO memory ordering */
126
127#define smp_store_release(p, v) \
128do { \
129 compiletime_assert_atomic_type(*p); \
130 barrier(); \
131 ACCESS_ONCE(*p) = (v); \
132} while (0)
133
134#define smp_load_acquire(p) \
135({ \
136 typeof(*p) ___p1 = ACCESS_ONCE(*p); \
137 compiletime_assert_atomic_type(*p); \
138 barrier(); \
139 ___p1; \
140})
141
101#endif 142#endif
102 143
103/* 144/*
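
A hedged sketch of the intended pairing of the new primitives (the
shared_data/data_ready names are made up for illustration): the writer
publishes its payload with release semantics and the reader consumes it with
acquire semantics, which on ordinary TSO x86 costs only compiler barriers.

/* Illustrative only; these variables are hypothetical. */
static int shared_data;
static int data_ready;

static void writer(void)
{
	shared_data = 42;			/* plain store */
	smp_store_release(&data_ready, 1);	/* nothing above may sink below */
}

static int reader(void)
{
	if (smp_load_acquire(&data_ready))	/* nothing below may hoist above */
		return shared_data;		/* guaranteed to observe 42 */
	return -1;
}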
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 89270b4318db..e099f9502ace 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -216,6 +216,7 @@
216#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 216#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
217#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ 217#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
218#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ 218#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
219#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
219#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ 220#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
220#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ 221#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
221#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ 222#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index fd8f9e2ca35f..535192f6bfad 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,7 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
13} 13}
14 14
15/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
16#define dmi_ioremap early_ioremap 16#define dmi_early_remap early_ioremap
17#define dmi_iounmap early_iounmap 17#define dmi_early_unmap early_iounmap
18#define dmi_remap ioremap
19#define dmi_unmap iounmap
18 20
19#endif /* _ASM_X86_DMI_H */ 21#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 65c6e6e3a552..3b978c472d08 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,6 +1,24 @@
1#ifndef _ASM_X86_EFI_H 1#ifndef _ASM_X86_EFI_H
2#define _ASM_X86_EFI_H 2#define _ASM_X86_EFI_H
3 3
4/*
5 * We map the EFI regions needed for runtime services non-contiguously,
6 * with preserved alignment on virtual addresses starting from -4G down
7 * for a total max space of 64G. This way, we provide for stable runtime
8 * services addresses across kernels so that a kexec'd kernel can still
9 * use them.
10 *
11 * This is the main reason why we're doing stable VA mappings for RT
12 * services.
13 *
14 * This flag is used in conjunction with a chicken bit called
15 * "efi=old_map" which can be used as a fallback to the old runtime
16 * services mapping method in case there's some b0rkage with a
17 * particular EFI implementation (haha, it is hard to hold up the
18 * sarcasm here...).
19 */
20#define EFI_OLD_MEMMAP EFI_ARCH_1
21
4#ifdef CONFIG_X86_32 22#ifdef CONFIG_X86_32
5 23
6#define EFI_LOADER_SIGNATURE "EL32" 24#define EFI_LOADER_SIGNATURE "EL32"
@@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
69 efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ 87 efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \
70 (u64)(a4), (u64)(a5), (u64)(a6)) 88 (u64)(a4), (u64)(a5), (u64)(a6))
71 89
90#define _efi_call_virtX(x, f, ...) \
91({ \
92 efi_status_t __s; \
93 \
94 efi_sync_low_kernel_mappings(); \
95 preempt_disable(); \
96 __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \
97 preempt_enable(); \
98 __s; \
99})
100
72#define efi_call_virt0(f) \ 101#define efi_call_virt0(f) \
73 efi_call0((efi.systab->runtime->f)) 102 _efi_call_virtX(0, f)
74#define efi_call_virt1(f, a1) \ 103#define efi_call_virt1(f, a1) \
75 efi_call1((efi.systab->runtime->f), (u64)(a1)) 104 _efi_call_virtX(1, f, (u64)(a1))
76#define efi_call_virt2(f, a1, a2) \ 105#define efi_call_virt2(f, a1, a2) \
77 efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) 106 _efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
78#define efi_call_virt3(f, a1, a2, a3) \ 107#define efi_call_virt3(f, a1, a2, a3) \
79 efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 108 _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
80 (u64)(a3)) 109#define efi_call_virt4(f, a1, a2, a3, a4) \
81#define efi_call_virt4(f, a1, a2, a3, a4) \ 110 _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
82 efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 111#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
83 (u64)(a3), (u64)(a4)) 112 _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
84#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ 113#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
85 efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 114 _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 (u64)(a3), (u64)(a4), (u64)(a5))
87#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
88 efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
89 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
90 115
91extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, 116extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
92 u32 type, u64 attribute); 117 u32 type, u64 attribute);
@@ -95,12 +120,28 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
95 120
96extern int add_efi_memmap; 121extern int add_efi_memmap;
97extern unsigned long x86_efi_facility; 122extern unsigned long x86_efi_facility;
123extern struct efi_scratch efi_scratch;
98extern void efi_set_executable(efi_memory_desc_t *md, bool executable); 124extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
99extern int efi_memblock_x86_reserve_range(void); 125extern int efi_memblock_x86_reserve_range(void);
100extern void efi_call_phys_prelog(void); 126extern void efi_call_phys_prelog(void);
101extern void efi_call_phys_epilog(void); 127extern void efi_call_phys_epilog(void);
102extern void efi_unmap_memmap(void); 128extern void efi_unmap_memmap(void);
103extern void efi_memory_uc(u64 addr, unsigned long size); 129extern void efi_memory_uc(u64 addr, unsigned long size);
130extern void __init efi_map_region(efi_memory_desc_t *md);
131extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
132extern void efi_sync_low_kernel_mappings(void);
133extern void efi_setup_page_tables(void);
134extern void __init old_map_region(efi_memory_desc_t *md);
135
136struct efi_setup_data {
137 u64 fw_vendor;
138 u64 runtime;
139 u64 tables;
140 u64 smbios;
141 u64 reserved[8];
142};
143
144extern u64 efi_setup;
104 145
105#ifdef CONFIG_EFI 146#ifdef CONFIG_EFI
106 147
@@ -110,7 +151,7 @@ static inline bool efi_is_native(void)
110} 151}
111 152
112extern struct console early_efi_console; 153extern struct console early_efi_console;
113 154extern void parse_efi_setup(u64 phys_addr, u32 data_len);
114#else 155#else
115/* 156/*
116 * IF EFI is not configured, have the EFI calls return -ENOSYS. 157 * IF EFI is not configured, have the EFI calls return -ENOSYS.
@@ -122,6 +163,7 @@ extern struct console early_efi_console;
122#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) 163#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
123#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) 164#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
124#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) 165#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
166static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
125#endif /* CONFIG_EFI */ 167#endif /* CONFIG_EFI */
126 168
127#endif /* _ASM_X86_EFI_H */ 169#endif /* _ASM_X86_EFI_H */
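
A hedged sketch of a virtual runtime-service call through the reworked
wrappers; the local efi_time_t/efi_time_cap_t variables and the function name
are illustrative:

static efi_status_t example_get_time(void)
{
	efi_time_t tm;
	efi_time_cap_t tc;

	/* expands to efi_call2(efi.systab->runtime->get_time, ...) with the
	 * kernel mappings synced and preemption disabled around the call */
	return efi_call_virt2(get_time, &tm, &tc);
}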
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e846225265ed..7252cd339175 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -175,64 +175,7 @@ static inline void __set_fixmap(enum fixed_addresses idx,
175} 175}
176#endif 176#endif
177 177
178#define set_fixmap(idx, phys) \ 178#include <asm-generic/fixmap.h>
179 __set_fixmap(idx, phys, PAGE_KERNEL)
180
181/*
182 * Some hardware wants to get fixmapped without caching.
183 */
184#define set_fixmap_nocache(idx, phys) \
185 __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
186
187#define clear_fixmap(idx) \
188 __set_fixmap(idx, 0, __pgprot(0))
189
190#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
191#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
192
193extern void __this_fixmap_does_not_exist(void);
194
195/*
196 * 'index to address' translation. If anyone tries to use the idx
197 * directly without translation, we catch the bug with a NULL-deference
198 * kernel oops. Illegal ranges of incoming indices are caught too.
199 */
200static __always_inline unsigned long fix_to_virt(const unsigned int idx)
201{
202 /*
203 * this branch gets completely eliminated after inlining,
204 * except when someone tries to use fixaddr indices in an
205 * illegal way. (such as mixing up address types or using
206 * out-of-range indices).
207 *
208 * If it doesn't get removed, the linker will complain
209 * loudly with a reasonably clear error message..
210 */
211 if (idx >= __end_of_fixed_addresses)
212 __this_fixmap_does_not_exist();
213
214 return __fix_to_virt(idx);
215}
216
217static inline unsigned long virt_to_fix(const unsigned long vaddr)
218{
219 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
220 return __virt_to_fix(vaddr);
221}
222
223/* Return an pointer with offset calculated */
224static __always_inline unsigned long
225__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
226{
227 __set_fixmap(idx, phys, flags);
228 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
229}
230
231#define set_fixmap_offset(idx, phys) \
232 __set_fixmap_offset(idx, phys, PAGE_KERNEL)
233
234#define set_fixmap_offset_nocache(idx, phys) \
235 __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
236 179
237#endif /* !__ASSEMBLY__ */ 180#endif /* !__ASSEMBLY__ */
238#endif /* _ASM_X86_FIXMAP_H */ 181#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index c49a613c6452..cea1c76d49bf 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -293,12 +293,13 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
293 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 293 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
294 is pending. Clear the x87 state here by setting it to fixed 294 is pending. Clear the x87 state here by setting it to fixed
295 values. "m" is a random variable that should be in L1 */ 295 values. "m" is a random variable that should be in L1 */
296 alternative_input( 296 if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) {
297 ASM_NOP8 ASM_NOP2, 297 asm volatile(
298 "emms\n\t" /* clear stack tags */ 298 "fnclex\n\t"
299 "fildl %P[addr]", /* set F?P to defined value */ 299 "emms\n\t"
300 X86_FEATURE_FXSAVE_LEAK, 300 "fildl %P[addr]" /* set F?P to defined value */
301 [addr] "m" (tsk->thread.fpu.has_fpu)); 301 : : [addr] "m" (tsk->thread.fpu.has_fpu));
302 }
302 303
303 return fpu_restore_checking(&tsk->thread.fpu); 304 return fpu_restore_checking(&tsk->thread.fpu);
304} 305}
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index be27ba1e947a..b4c1f5453436 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -110,26 +110,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
110static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, 110static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
111 u32 oldval, u32 newval) 111 u32 oldval, u32 newval)
112{ 112{
113 int ret = 0; 113 return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval);
114
115 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
116 return -EFAULT;
117
118 asm volatile("\t" ASM_STAC "\n"
119 "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
120 "2:\t" ASM_CLAC "\n"
121 "\t.section .fixup, \"ax\"\n"
122 "3:\tmov %3, %0\n"
123 "\tjmp 2b\n"
124 "\t.previous\n"
125 _ASM_EXTABLE(1b, 3b)
126 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
127 : "i" (-EFAULT), "r" (newval), "1" (oldval)
128 : "memory"
129 );
130
131 *uval = oldval;
132 return ret;
133} 114}
134 115
135#endif 116#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index cba45d99ac1a..67d69b8e2d20 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
191#define trace_interrupt interrupt 191#define trace_interrupt interrupt
192#endif 192#endif
193 193
194#define VECTOR_UNDEFINED -1
195#define VECTOR_RETRIGGERED -2
196
194typedef int vector_irq_t[NR_VECTORS]; 197typedef int vector_irq_t[NR_VECTORS];
195DECLARE_PER_CPU(vector_irq_t, vector_irq); 198DECLARE_PER_CPU(vector_irq_t, vector_irq);
196extern void setup_vector_irq(int cpu); 199extern void setup_vector_irq(int cpu);
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 459769d39263..e34e097b6f9d 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -51,10 +51,41 @@ struct devs_id {
51enum intel_mid_cpu_type { 51enum intel_mid_cpu_type {
52 /* 1 was Moorestown */ 52 /* 1 was Moorestown */
53 INTEL_MID_CPU_CHIP_PENWELL = 2, 53 INTEL_MID_CPU_CHIP_PENWELL = 2,
54 INTEL_MID_CPU_CHIP_CLOVERVIEW,
55 INTEL_MID_CPU_CHIP_TANGIER,
54}; 56};
55 57
56extern enum intel_mid_cpu_type __intel_mid_cpu_chip; 58extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
57 59
60/**
61 * struct intel_mid_ops - Interface between intel-mid & sub archs
62 * @arch_setup: arch_setup function to re-initialize platform
63 * structures (x86_init, x86_platform_init)
64 *
65 * This structure can be extended if any new interface is required
66 * between intel-mid & its sub arch files.
67 */
68struct intel_mid_ops {
69 void (*arch_setup)(void);
70};
71
72/* Helper API's for INTEL_MID_OPS_INIT */
73#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \
74 [cpuid] = get_##cpuname##_ops
75
76/* Maximum number of CPU ops */
77#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *))
78
79/*
80 * For every new cpu addition, a weak get_<cpuname>_ops() function needs to be
81 * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h.
82 */
83#define INTEL_MID_OPS_INIT {\
84 DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \
85 DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \
86 DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \
87};
88
58#ifdef CONFIG_X86_INTEL_MID 89#ifdef CONFIG_X86_INTEL_MID
59 90
60static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) 91static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
@@ -86,8 +117,21 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
86 * Penwell uses spread spectrum clock, so the freq number is not exactly 117 * Penwell uses spread spectrum clock, so the freq number is not exactly
87 * the same as reported by MSR based on SDM. 118 * the same as reported by MSR based on SDM.
88 */ 119 */
89#define PENWELL_FSB_FREQ_83SKU 83200 120#define FSB_FREQ_83SKU 83200
90#define PENWELL_FSB_FREQ_100SKU 99840 121#define FSB_FREQ_100SKU 99840
122#define FSB_FREQ_133SKU 133000
123
124#define FSB_FREQ_167SKU 167000
125#define FSB_FREQ_200SKU 200000
126#define FSB_FREQ_267SKU 267000
127#define FSB_FREQ_333SKU 333000
128#define FSB_FREQ_400SKU 400000
129
130/* Bus Select SoC Fuse value */
131#define BSEL_SOC_FUSE_MASK 0x7
132#define BSEL_SOC_FUSE_001 0x1 /* FSB 133MHz */
133#define BSEL_SOC_FUSE_101 0x5 /* FSB 100MHz */
134#define BSEL_SOC_FUSE_111 0x7 /* FSB 83MHz */
91 135
92#define SFI_MTMR_MAX_NUM 8 136#define SFI_MTMR_MAX_NUM 8
93#define SFI_MRTC_MAX 8 137#define SFI_MRTC_MAX 8
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
new file mode 100644
index 000000000000..8e71c7941767
--- /dev/null
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -0,0 +1,90 @@
1/*
2 * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
3 */
4
5#ifndef IOSF_MBI_SYMS_H
6#define IOSF_MBI_SYMS_H
7
8#define MBI_MCR_OFFSET 0xD0
9#define MBI_MDR_OFFSET 0xD4
10#define MBI_MCRX_OFFSET 0xD8
11
12#define MBI_RD_MASK 0xFEFFFFFF
13#define MBI_WR_MASK 0X01000000
14
15#define MBI_MASK_HI 0xFFFFFF00
16#define MBI_MASK_LO 0x000000FF
17#define MBI_ENABLE 0xF0
18
19/* Baytrail available units */
20#define BT_MBI_UNIT_AUNIT 0x00
21#define BT_MBI_UNIT_SMC 0x01
22#define BT_MBI_UNIT_CPU 0x02
23#define BT_MBI_UNIT_BUNIT 0x03
24#define BT_MBI_UNIT_PMC 0x04
25#define BT_MBI_UNIT_GFX 0x06
26#define BT_MBI_UNIT_SMI 0x0C
27#define BT_MBI_UNIT_USB 0x43
28#define BT_MBI_UNIT_SATA 0xA3
29#define BT_MBI_UNIT_PCIE 0xA6
30
31/* Baytrail read/write opcodes */
32#define BT_MBI_AUNIT_READ 0x10
33#define BT_MBI_AUNIT_WRITE 0x11
34#define BT_MBI_SMC_READ 0x10
35#define BT_MBI_SMC_WRITE 0x11
36#define BT_MBI_CPU_READ 0x10
37#define BT_MBI_CPU_WRITE 0x11
38#define BT_MBI_BUNIT_READ 0x10
39#define BT_MBI_BUNIT_WRITE 0x11
40#define BT_MBI_PMC_READ 0x06
41#define BT_MBI_PMC_WRITE 0x07
42#define BT_MBI_GFX_READ 0x00
43#define BT_MBI_GFX_WRITE 0x01
44#define BT_MBI_SMIO_READ 0x06
45#define BT_MBI_SMIO_WRITE 0x07
46#define BT_MBI_USB_READ 0x06
47#define BT_MBI_USB_WRITE 0x07
48#define BT_MBI_SATA_READ 0x00
49#define BT_MBI_SATA_WRITE 0x01
50#define BT_MBI_PCIE_READ 0x00
51#define BT_MBI_PCIE_WRITE 0x01
52
53/**
54 * iosf_mbi_read() - MailBox Interface read command
55 * @port: port indicating subunit being accessed
56 * @opcode: port specific read or write opcode
57 * @offset: register address offset
58 * @mdr: register data to be read
59 *
60 * Locking is handled by spinlock - cannot sleep.
61 * Return: Nonzero on error
62 */
63int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr);
64
65/**
66 * iosf_mbi_write() - MailBox unmasked write command
67 * @port: port indicating subunit being accessed
68 * @opcode: port specific read or write opcode
69 * @offset: register address offset
70 * @mdr: register data to be written
71 *
72 * Locking is handled by spinlock - cannot sleep.
73 * Return: Nonzero on error
74 */
75int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
76
77/**
78 * iosf_mbi_modify() - MailBox masked write command
79 * @port: port indicating subunit being accessed
80 * @opcode: port specific read or write opcode
81 * @offset: register address offset
82 * @mdr: register data being modified
83 * @mask: mask indicating bits in mdr to be modified
84 *
85 * Locking is handled by spinlock - cannot sleep.
86 * Return: Nonzero on error
87 */
88int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
89
90#endif /* IOSF_MBI_SYMS_H */
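
A minimal usage sketch for the mailbox helpers, assuming a hypothetical PMC
register at offset 0x34 (only the prototypes and BT_MBI_* constants above come
from the header; the offset and function name are made up):

#include <asm/iosf_mbi.h>

static int example_read_pmc_reg(u32 *val)
{
	/* read one 32-bit register from the PMC subunit; returns nonzero on error */
	return iosf_mbi_read(BT_MBI_UNIT_PMC, BT_MBI_PMC_READ, 0x34, val);
}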
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 0ea10f27d613..cb6cfcd034cf 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,6 +25,7 @@ extern void irq_ctx_init(int cpu);
25 25
26#ifdef CONFIG_HOTPLUG_CPU 26#ifdef CONFIG_HOTPLUG_CPU
27#include <linux/cpumask.h> 27#include <linux/cpumask.h>
28extern int check_irq_vectors_for_cpu_disable(void);
28extern void fixup_irqs(void); 29extern void fixup_irqs(void);
29extern void irq_force_complete_move(int); 30extern void irq_force_complete_move(int);
30#endif 31#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ae5d7830855c..fdf83afbb7d9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -605,6 +605,7 @@ struct kvm_arch {
605 /* fields used by HYPER-V emulation */ 605 /* fields used by HYPER-V emulation */
606 u64 hv_guest_os_id; 606 u64 hv_guest_os_id;
607 u64 hv_hypercall; 607 u64 hv_hypercall;
608 u64 hv_tsc_page;
608 609
609 #ifdef CONFIG_KVM_MMU_AUDIT 610 #ifdef CONFIG_KVM_MMU_AUDIT
610 int audit_point; 611 int audit_point;
@@ -699,6 +700,8 @@ struct kvm_x86_ops {
699 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 700 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
700 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 701 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
701 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 702 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
703 u64 (*get_dr6)(struct kvm_vcpu *vcpu);
704 void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
702 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); 705 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
703 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 706 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
704 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 707 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c696a8687567..6e4ce2df87cf 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -118,7 +118,6 @@ extern void mce_register_decode_chain(struct notifier_block *nb);
118extern void mce_unregister_decode_chain(struct notifier_block *nb); 118extern void mce_unregister_decode_chain(struct notifier_block *nb);
119 119
120#include <linux/percpu.h> 120#include <linux/percpu.h>
121#include <linux/init.h>
122#include <linux/atomic.h> 121#include <linux/atomic.h>
123 122
124extern int mce_p5_enabled; 123extern int mce_p5_enabled;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index f98bd6625318..b59827e76529 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,21 @@
1#ifndef _ASM_X86_MICROCODE_H 1#ifndef _ASM_X86_MICROCODE_H
2#define _ASM_X86_MICROCODE_H 2#define _ASM_X86_MICROCODE_H
3 3
4#define native_rdmsr(msr, val1, val2) \
5do { \
6 u64 __val = native_read_msr((msr)); \
7 (void)((val1) = (u32)__val); \
8 (void)((val2) = (u32)(__val >> 32)); \
9} while (0)
10
11#define native_wrmsr(msr, low, high) \
12 native_write_msr(msr, low, high)
13
14#define native_wrmsrl(msr, val) \
15 native_write_msr((msr), \
16 (u32)((u64)(val)), \
17 (u32)((u64)(val) >> 32))
18
4struct cpu_signature { 19struct cpu_signature {
5 unsigned int sig; 20 unsigned int sig;
6 unsigned int pf; 21 unsigned int pf;
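A minimal sketch of how the native_rdmsr()/native_wrmsr() helpers added above split and reassemble a 64-bit MSR value; MSR_EXAMPLE is a placeholder index, not a real MSR:

static void example_msr_roundtrip(void)
{
	u32 lo, hi;

	native_rdmsr(MSR_EXAMPLE, lo, hi);	/* lo = bits 0..31, hi = bits 32..63 */
	native_wrmsr(MSR_EXAMPLE, lo | 0x1, hi);	/* write back with bit 0 set */
}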
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 4c019179a57d..b7b10b82d3e5 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -61,11 +61,10 @@ extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
61extern int apply_microcode_amd(int cpu); 61extern int apply_microcode_amd(int cpu);
62extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); 62extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
63 63
64#define PATCH_MAX_SIZE PAGE_SIZE
65extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
66
64#ifdef CONFIG_MICROCODE_AMD_EARLY 67#ifdef CONFIG_MICROCODE_AMD_EARLY
65#ifdef CONFIG_X86_32
66#define MPB_MAX_SIZE PAGE_SIZE
67extern u8 amd_bsp_mpb[MPB_MAX_SIZE];
68#endif
69extern void __init load_ucode_amd_bsp(void); 68extern void __init load_ucode_amd_bsp(void);
70extern void load_ucode_amd_ap(void); 69extern void load_ucode_amd_ap(void);
71extern int __init save_microcode_in_initrd_amd(void); 70extern int __init save_microcode_in_initrd_amd(void);
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3142a94c7b4b..3e6b4920ef5d 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_MPSPEC_H 1#ifndef _ASM_X86_MPSPEC_H
2#define _ASM_X86_MPSPEC_H 2#define _ASM_X86_MPSPEC_H
3 3
4#include <linux/init.h>
5 4
6#include <asm/mpspec_def.h> 5#include <asm/mpspec_def.h>
7#include <asm/x86_init.h> 6#include <asm/x86_init.h>
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..1da25a5f96f9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_MWAIT_H 1#ifndef _ASM_X86_MWAIT_H
2#define _ASM_X86_MWAIT_H 2#define _ASM_X86_MWAIT_H
3 3
4#include <linux/sched.h>
5
4#define MWAIT_SUBSTATE_MASK 0xf 6#define MWAIT_SUBSTATE_MASK 0xf
5#define MWAIT_CSTATE_MASK 0xf 7#define MWAIT_CSTATE_MASK 0xf
6#define MWAIT_SUBSTATE_SIZE 4 8#define MWAIT_SUBSTATE_SIZE 4
@@ -13,4 +15,45 @@
13 15
14#define MWAIT_ECX_INTERRUPT_BREAK 0x1 16#define MWAIT_ECX_INTERRUPT_BREAK 0x1
15 17
18static inline void __monitor(const void *eax, unsigned long ecx,
19 unsigned long edx)
20{
21 /* "monitor %eax, %ecx, %edx;" */
22 asm volatile(".byte 0x0f, 0x01, 0xc8;"
23 :: "a" (eax), "c" (ecx), "d"(edx));
24}
25
26static inline void __mwait(unsigned long eax, unsigned long ecx)
27{
28 /* "mwait %eax, %ecx;" */
29 asm volatile(".byte 0x0f, 0x01, 0xc9;"
30 :: "a" (eax), "c" (ecx));
31}
32
33/*
34 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
35 * which can obviate IPI to trigger checking of need_resched.
36 * We execute MONITOR against need_resched and enter optimized wait state
37 * through MWAIT. Whenever someone changes need_resched, we would be woken
38 * up from MWAIT (without an IPI).
39 *
40 * New with Core Duo processors, MWAIT can take some hints based on CPU
41 * capability.
42 */
43static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
44{
45 if (!current_set_polling_and_test()) {
46 if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
47 mb();
48 clflush((void *)&current_thread_info()->flags);
49 mb();
50 }
51
52 __monitor((void *)&current_thread_info()->flags, 0, 0);
53 if (!need_resched())
54 __mwait(eax, ecx);
55 }
56 current_clr_polling();
57}
58
16#endif /* _ASM_X86_MWAIT_H */ 59#endif /* _ASM_X86_MWAIT_H */
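As a usage sketch (illustration only, not part of this patch), an idle driver passes the MWAIT hint for the target C-state in eax and requests interrupt break-out in ecx:

static void example_enter_shallowest_cstate(void)
{
	/* eax hint 0 selects C1; allow an interrupt to break out of MWAIT. */
	mwait_idle_with_hints(0, MWAIT_ECX_INTERRUPT_BREAK);
}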
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index c87892442e53..775873d3be55 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -71,6 +71,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
71#include <asm-generic/getorder.h> 71#include <asm-generic/getorder.h>
72 72
73#define __HAVE_ARCH_GATE_AREA 1 73#define __HAVE_ARCH_GATE_AREA 1
74#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
74 75
75#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
76#endif /* _ASM_X86_PAGE_H */ 77#endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 4d550d04b609..904f528cc8e8 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -5,10 +5,6 @@
5 5
6#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
7 7
8#ifdef CONFIG_HUGETLB_PAGE
9#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
10#endif
11
12#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) 8#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
13#ifdef CONFIG_DEBUG_VIRTUAL 9#ifdef CONFIG_DEBUG_VIRTUAL
14extern unsigned long __phys_addr(unsigned long); 10extern unsigned long __phys_addr(unsigned long);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 43dcd804ebd5..8de6d9cf3b95 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -39,9 +39,18 @@
39#define __VIRTUAL_MASK_SHIFT 47 39#define __VIRTUAL_MASK_SHIFT 47
40 40
41/* 41/*
42 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in 42 * Kernel image size is limited to 1GiB due to the fixmap living in the
43 * arch/x86/kernel/head_64.S), and it is mapped here: 43 * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
44 * 512MiB by default, leaving 1.5GiB for modules once the page tables
45 * are fully set up. If kernel ASLR is configured, it can extend the
46 * kernel page table mapping, reducing the size of the modules area.
44 */ 47 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) 48#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024)
49#if defined(CONFIG_RANDOMIZE_BASE) && \
50 CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
51#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET
52#else
53#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT
54#endif
46 55
47#endif /* _ASM_X86_PAGE_64_DEFS_H */ 56#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..2f59cce3b38a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr);
51extern unsigned long max_low_pfn_mapped; 51extern unsigned long max_low_pfn_mapped;
52extern unsigned long max_pfn_mapped; 52extern unsigned long max_pfn_mapped;
53 53
54static inline phys_addr_t get_max_mapped(void) 54static inline phys_addr_t get_max_low_mapped(void)
55{ 55{
56 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; 56 return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT;
57} 57}
58 58
59bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); 59bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 947b5c417e83..1ac6114c9ea5 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -104,7 +104,7 @@ extern void pci_iommu_alloc(void);
104struct msi_desc; 104struct msi_desc;
105int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 105int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
106void native_teardown_msi_irq(unsigned int irq); 106void native_teardown_msi_irq(unsigned int irq);
107void native_restore_msi_irqs(struct pci_dev *dev, int irq); 107void native_restore_msi_irqs(struct pci_dev *dev);
108int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, 108int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
109 unsigned int irq_base, unsigned int irq_offset); 109 unsigned int irq_base, unsigned int irq_offset);
110#else 110#else
@@ -125,7 +125,6 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
125 125
126/* generic pci stuff */ 126/* generic pci stuff */
127#include <asm-generic/pci.h> 127#include <asm-generic/pci.h>
128#define PCIBIOS_MAX_MEM_32 0xffffffff
129 128
130#ifdef CONFIG_NUMA 129#ifdef CONFIG_NUMA
131/* Returns the node based on pci bus */ 130/* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 3bf2dd0cf61f..0d193e234647 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -55,6 +55,13 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif 56#endif
57 57
58/* Bit manipulation helper on pte/pgoff entry */
59static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
60 unsigned long mask, unsigned int leftshift)
61{
62 return ((value >> rightshift) & mask) << leftshift;
63}
64
58#ifdef CONFIG_MEM_SOFT_DIRTY 65#ifdef CONFIG_MEM_SOFT_DIRTY
59 66
60/* 67/*
@@ -71,31 +78,34 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
71#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 78#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
72#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) 79#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
73 80
74#define pte_to_pgoff(pte) \ 81#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
75 ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ 82#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
76 & ((1U << PTE_FILE_BITS1) - 1))) \ 83#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1)
77 + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ 84
78 & ((1U << PTE_FILE_BITS2) - 1)) \ 85#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
79 << (PTE_FILE_BITS1)) \ 86#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
80 + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ 87#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)
81 & ((1U << PTE_FILE_BITS3) - 1)) \ 88
82 << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 89static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
83 + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ 90{
84 << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) 91 return (pgoff_t)
85 92 (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
86#define pgoff_to_pte(off) \ 93 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
87 ((pte_t) { .pte_low = \ 94 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) +
88 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ 95 pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4));
89 + ((((off) >> PTE_FILE_BITS1) \ 96}
90 & ((1U << PTE_FILE_BITS2) - 1)) \ 97
91 << PTE_FILE_SHIFT2) \ 98static __always_inline pte_t pgoff_to_pte(pgoff_t off)
92 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 99{
93 & ((1U << PTE_FILE_BITS3) - 1)) \ 100 return (pte_t){
94 << PTE_FILE_SHIFT3) \ 101 .pte_low =
95 + ((((off) >> \ 102 pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
96 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ 103 pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
97 << PTE_FILE_SHIFT4) \ 104 pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) +
98 + _PAGE_FILE }) 105 pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) +
106 _PAGE_FILE,
107 };
108}
99 109
100#else /* CONFIG_MEM_SOFT_DIRTY */ 110#else /* CONFIG_MEM_SOFT_DIRTY */
101 111
@@ -115,22 +125,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
115#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) 125#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
116#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 126#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
117 127
118#define pte_to_pgoff(pte) \ 128#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
119 ((((pte).pte_low >> PTE_FILE_SHIFT1) \ 129#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
120 & ((1U << PTE_FILE_BITS1) - 1)) \ 130
121 + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ 131#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
122 & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ 132#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
123 + (((pte).pte_low >> PTE_FILE_SHIFT3) \ 133
124 << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) 134static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
125 135{
126#define pgoff_to_pte(off) \ 136 return (pgoff_t)
127 ((pte_t) { .pte_low = \ 137 (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
128 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ 138 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
129 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ 139 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3));
130 << PTE_FILE_SHIFT2) \ 140}
131 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 141
132 << PTE_FILE_SHIFT3) \ 142static __always_inline pte_t pgoff_to_pte(pgoff_t off)
133 + _PAGE_FILE }) 143{
144 return (pte_t){
145 .pte_low =
146 pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
147 pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
148 pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) +
149 _PAGE_FILE,
150 };
151}
134 152
135#endif /* CONFIG_MEM_SOFT_DIRTY */ 153#endif /* CONFIG_MEM_SOFT_DIRTY */
136 154
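Each converted encode/decode function above is just a sum of pte_bitop() field moves. A worked example of a single move, with made-up numbers:

	pte_bitop(0xb0, 4, 0x7, 8)
		= ((0xb0 >> 4) & 0x7) << 8
		= (0xb & 0x7) << 8
		= 0x3 << 8
		= 0x300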
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d883440cb9a..c883bf726398 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t;
58#define VMALLOC_START _AC(0xffffc90000000000, UL) 58#define VMALLOC_START _AC(0xffffc90000000000, UL)
59#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) 59#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
60#define VMEMMAP_START _AC(0xffffea0000000000, UL) 60#define VMEMMAP_START _AC(0xffffea0000000000, UL)
61#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 61#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
62#define MODULES_END _AC(0xffffffffff000000, UL) 62#define MODULES_END _AC(0xffffffffff000000, UL)
63#define MODULES_LEN (MODULES_END - MODULES_VADDR) 63#define MODULES_LEN (MODULES_END - MODULES_VADDR)
64 64
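Taken together with the KERNEL_IMAGE_SIZE change above, the modules area now shrinks as the kernel mapping grows. A worked example, assuming CONFIG_RANDOMIZE_BASE_MAX_OFFSET = 1 GiB and __START_KERNEL_map at its usual 0xffffffff80000000:

	KERNEL_IMAGE_SIZE = 0x40000000                      (1 GiB instead of the 512 MiB default)
	MODULES_VADDR     = 0xffffffff80000000 + 0x40000000 = 0xffffffffc0000000
	MODULES_LEN       = 0xffffffffff000000 - 0xffffffffc0000000
	                  = 0x3f000000  (~1008 MiB, down from ~1520 MiB with the 512 MiB default)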
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0ecac257fb26..a83aa44bb1fb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
382 */ 382 */
383extern pte_t *lookup_address(unsigned long address, unsigned int *level); 383extern pte_t *lookup_address(unsigned long address, unsigned int *level);
384extern phys_addr_t slow_virt_to_phys(void *__address); 384extern phys_addr_t slow_virt_to_phys(void *__address);
385 385extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
386 unsigned numpages, unsigned long page_flags);
386#endif /* !__ASSEMBLY__ */ 387#endif /* !__ASSEMBLY__ */
387 388
388#endif /* _ASM_X86_PGTABLE_DEFS_H */ 389#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4057f9..fdedd38fd0fc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,7 +27,6 @@ struct mm_struct;
27#include <linux/cache.h> 27#include <linux/cache.h>
28#include <linux/threads.h> 28#include <linux/threads.h>
29#include <linux/math64.h> 29#include <linux/math64.h>
30#include <linux/init.h>
31#include <linux/err.h> 30#include <linux/err.h>
32#include <linux/irqflags.h> 31#include <linux/irqflags.h>
33 32
@@ -72,6 +71,7 @@ extern u16 __read_mostly tlb_lli_4m[NR_INFO];
72extern u16 __read_mostly tlb_lld_4k[NR_INFO]; 71extern u16 __read_mostly tlb_lld_4k[NR_INFO];
73extern u16 __read_mostly tlb_lld_2m[NR_INFO]; 72extern u16 __read_mostly tlb_lld_2m[NR_INFO];
74extern u16 __read_mostly tlb_lld_4m[NR_INFO]; 73extern u16 __read_mostly tlb_lld_4m[NR_INFO];
74extern u16 __read_mostly tlb_lld_1g[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift; 75extern s8 __read_mostly tlb_flushall_shift;
76 76
77/* 77/*
@@ -370,6 +370,20 @@ struct ymmh_struct {
370 u32 ymmh_space[64]; 370 u32 ymmh_space[64];
371}; 371};
372 372
373/* We don't support LWP yet: */
374struct lwp_struct {
375 u8 reserved[128];
376};
377
378struct bndregs_struct {
379 u64 bndregs[8];
380} __packed;
381
382struct bndcsr_struct {
383 u64 cfg_reg_u;
384 u64 status_reg;
385} __packed;
386
373struct xsave_hdr_struct { 387struct xsave_hdr_struct {
374 u64 xstate_bv; 388 u64 xstate_bv;
375 u64 reserved1[2]; 389 u64 reserved1[2];
@@ -380,6 +394,9 @@ struct xsave_struct {
380 struct i387_fxsave_struct i387; 394 struct i387_fxsave_struct i387;
381 struct xsave_hdr_struct xsave_hdr; 395 struct xsave_hdr_struct xsave_hdr;
382 struct ymmh_struct ymmh; 396 struct ymmh_struct ymmh;
397 struct lwp_struct lwp;
398 struct bndregs_struct bndregs;
399 struct bndcsr_struct bndcsr;
383 /* new processor state extensions will go here */ 400 /* new processor state extensions will go here */
384} __attribute__ ((packed, aligned (64))); 401} __attribute__ ((packed, aligned (64)));
385 402
@@ -700,29 +717,6 @@ static inline void sync_core(void)
700#endif 717#endif
701} 718}
702 719
703static inline void __monitor(const void *eax, unsigned long ecx,
704 unsigned long edx)
705{
706 /* "monitor %eax, %ecx, %edx;" */
707 asm volatile(".byte 0x0f, 0x01, 0xc8;"
708 :: "a" (eax), "c" (ecx), "d"(edx));
709}
710
711static inline void __mwait(unsigned long eax, unsigned long ecx)
712{
713 /* "mwait %eax, %ecx;" */
714 asm volatile(".byte 0x0f, 0x01, 0xc9;"
715 :: "a" (eax), "c" (ecx));
716}
717
718static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
719{
720 trace_hardirqs_on();
721 /* "mwait %eax, %ecx;" */
722 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
723 :: "a" (eax), "c" (ecx));
724}
725
726extern void select_idle_routine(const struct cpuinfo_x86 *c); 720extern void select_idle_routine(const struct cpuinfo_x86 *c);
727extern void init_amd_e400_c1e_mask(void); 721extern void init_amd_e400_c1e_mask(void);
728 722
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a08623a1a..14fd6fd75a19 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -60,7 +60,6 @@ struct pt_regs {
60 60
61#endif /* !__i386__ */ 61#endif /* !__i386__ */
62 62
63#include <linux/init.h>
64#ifdef CONFIG_PARAVIRT 63#ifdef CONFIG_PARAVIRT
65#include <asm/paravirt_types.h> 64#include <asm/paravirt_types.h>
66#endif 65#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 59bcf4e22418..d62c9f809bc5 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -3,7 +3,6 @@
3 3
4#include <uapi/asm/setup.h> 4#include <uapi/asm/setup.h>
5 5
6
7#define COMMAND_LINE_SIZE 2048 6#define COMMAND_LINE_SIZE 2048
8 7
9#include <linux/linkage.h> 8#include <linux/linkage.h>
@@ -29,6 +28,8 @@
29#include <asm/bootparam.h> 28#include <asm/bootparam.h>
30#include <asm/x86_init.h> 29#include <asm/x86_init.h>
31 30
31extern u64 relocated_ramdisk;
32
32/* Interrupt control for vSMPowered x86_64 systems */ 33/* Interrupt control for vSMPowered x86_64 systems */
33#ifdef CONFIG_X86_64 34#ifdef CONFIG_X86_64
34void vsmp_init(void); 35void vsmp_init(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4137890e88e3..8cd27e08e23c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_SMP_H 2#define _ASM_X86_SMP_H
3#ifndef __ASSEMBLY__ 3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5#include <linux/init.h>
6#include <asm/percpu.h> 5#include <asm/percpu.h>
7 6
8/* 7/*
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0eb5d0c..a04eabd43d06 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -1,9 +1,9 @@
1#ifndef _ASM_X86_TIMER_H 1#ifndef _ASM_X86_TIMER_H
2#define _ASM_X86_TIMER_H 2#define _ASM_X86_TIMER_H
3#include <linux/init.h>
4#include <linux/pm.h> 3#include <linux/pm.h>
5#include <linux/percpu.h> 4#include <linux/percpu.h>
6#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/math64.h>
7 7
8#define TICK_SIZE (tick_nsec / 1000) 8#define TICK_SIZE (tick_nsec / 1000)
9 9
@@ -12,68 +12,26 @@ extern int recalibrate_cpu_khz(void);
12 12
13extern int no_timer_check; 13extern int no_timer_check;
14 14
15/* Accelerators for sched_clock() 15/*
16 * convert from cycles(64bits) => nanoseconds (64bits) 16 * We use the full linear equation: f(x) = a + b*x, in order to allow
17 * basic equation: 17 * a continuous function in the face of dynamic freq changes.
18 * ns = cycles / (freq / ns_per_sec)
19 * ns = cycles * (ns_per_sec / freq)
20 * ns = cycles * (10^9 / (cpu_khz * 10^3))
21 * ns = cycles * (10^6 / cpu_khz)
22 * 18 *
23 * Then we use scaling math (suggested by george@mvista.com) to get: 19 * Continuity means that when our frequency changes our slope (b); we want to
24 * ns = cycles * (10^6 * SC / cpu_khz) / SC 20 * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
25 * ns = cycles * cyc2ns_scale / SC
26 * 21 *
27 * And since SC is a constant power of two, we can convert the div 22 * Without an offset (a) the above would not be possible.
28 * into a shift.
29 * 23 *
30 * We can use khz divisor instead of mhz to keep a better precision, since 24 * See the comment near cycles_2_ns() for details on how we compute (b).
31 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
32 * (mathieu.desnoyers@polymtl.ca)
33 *
34 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
35 *
36 * In:
37 *
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * Although we may still have enough bits to store the value of ns,
41 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
42 * leading to an incorrect result.
43 *
44 * To avoid this, we can decompose 'cycles' into quotient and remainder
45 * of division by SC. Then,
46 *
47 * ns = (quot * SC + rem) * cyc2ns_scale / SC
48 * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
49 *
50 * - sqazi@google.com
51 */ 25 */
52 26struct cyc2ns_data {
53DECLARE_PER_CPU(unsigned long, cyc2ns); 27 u32 cyc2ns_mul;
54DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); 28 u32 cyc2ns_shift;
55 29 u64 cyc2ns_offset;
56#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 30 u32 __count;
57 31 /* u32 hole */
58static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 32}; /* 24 bytes -- do not grow */
59{ 33
60 int cpu = smp_processor_id(); 34extern struct cyc2ns_data *cyc2ns_read_begin(void);
61 unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 35extern void cyc2ns_read_end(struct cyc2ns_data *);
62 ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
63 (1UL << CYC2NS_SCALE_FACTOR));
64 return ns;
65}
66
67static inline unsigned long long cycles_2_ns(unsigned long long cyc)
68{
69 unsigned long long ns;
70 unsigned long flags;
71
72 local_irq_save(flags);
73 ns = __cycles_2_ns(cyc);
74 local_irq_restore(flags);
75
76 return ns;
77}
78 36
79#endif /* _ASM_X86_TIMER_H */ 37#endif /* _ASM_X86_TIMER_H */
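A sketch of how a reader of the data declared above converts cycles to nanoseconds; mul_u64_u32_shr() from <linux/math64.h> (now included here) is assumed to be the multiply-and-shift primitive, and the real conversion code lives in tsc.c rather than this header:

static inline u64 example_cycles_to_ns(u64 cyc)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	u64 ns;

	/* ns = offset + (cyc * mul) >> shift, i.e. the f(x) = a + b*x above. */
	ns = data->cyc2ns_offset +
	     mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

	cyc2ns_read_end(data);
	return ns;
}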
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 235be70d5bb4..57ae63cd6ee2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -65,4 +65,7 @@ extern int notsc_setup(char *);
65extern void tsc_save_sched_clock_state(void); 65extern void tsc_save_sched_clock_state(void);
66extern void tsc_restore_sched_clock_state(void); 66extern void tsc_restore_sched_clock_state(void);
67 67
68/* MSR based TSC calibration for Intel Atom SoC platforms */
69int try_msr_calibrate_tsc(unsigned long *fast_calibrate);
70
68#endif /* _ASM_X86_TSC_H */ 71#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c07b125..0d592e0a5b84 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -40,22 +40,30 @@
40/* 40/*
41 * Test whether a block of memory is a valid user space address. 41 * Test whether a block of memory is a valid user space address.
42 * Returns 0 if the range is valid, nonzero otherwise. 42 * Returns 0 if the range is valid, nonzero otherwise.
43 *
44 * This is equivalent to the following test:
45 * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
46 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */ 43 */
44static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
45{
46 /*
47 * If we have used "sizeof()" for the size,
48 * we know it won't overflow the limit (but
49 * it might overflow the 'addr', so it's
50 * important to subtract the size from the
51 * limit, not add it to the address).
52 */
53 if (__builtin_constant_p(size))
54 return addr > limit - size;
55
56 /* Arbitrary sizes? Be careful about overflow */
57 addr += size;
58 if (addr < size)
59 return true;
60 return addr > limit;
61}
49 62
50#define __range_not_ok(addr, size, limit) \ 63#define __range_not_ok(addr, size, limit) \
51({ \ 64({ \
52 unsigned long flag, roksum; \
53 __chk_user_ptr(addr); \ 65 __chk_user_ptr(addr); \
54 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ 66 __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
55 : "=&r" (flag), "=r" (roksum) \
56 : "1" (addr), "g" ((long)(size)), \
57 "rm" (limit)); \
58 flag; \
59}) 67})
60 68
61/** 69/**
@@ -78,7 +86,7 @@
78 * this function, memory access functions may still return -EFAULT. 86 * this function, memory access functions may still return -EFAULT.
79 */ 87 */
80#define access_ok(type, addr, size) \ 88#define access_ok(type, addr, size) \
81 (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) 89 likely(!__range_not_ok(addr, size, user_addr_max()))
82 90
83/* 91/*
84 * The exception table consists of pairs of addresses relative to the 92 * The exception table consists of pairs of addresses relative to the
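Why the post-addition "addr < size" test in __chk_range_not_ok() is enough to catch a wrap, with concrete 64-bit values (illustration only):

	addr = 0xfffffffffffffff0, size = 0x20
	addr + size = 0x0000000000000010     (the sum carried out of 64 bits)
	addr < size  ->  0x10 < 0x20  ->  true, so the range is rejected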
@@ -525,6 +533,98 @@ extern __must_check long strnlen_user(const char __user *str, long n);
525unsigned long __must_check clear_user(void __user *mem, unsigned long len); 533unsigned long __must_check clear_user(void __user *mem, unsigned long len);
526unsigned long __must_check __clear_user(void __user *mem, unsigned long len); 534unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
527 535
536extern void __cmpxchg_wrong_size(void)
537 __compiletime_error("Bad argument size for cmpxchg");
538
539#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \
540({ \
541 int __ret = 0; \
542 __typeof__(ptr) __uval = (uval); \
543 __typeof__(*(ptr)) __old = (old); \
544 __typeof__(*(ptr)) __new = (new); \
545 switch (size) { \
546 case 1: \
547 { \
548 asm volatile("\t" ASM_STAC "\n" \
549 "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
550 "2:\t" ASM_CLAC "\n" \
551 "\t.section .fixup, \"ax\"\n" \
552 "3:\tmov %3, %0\n" \
553 "\tjmp 2b\n" \
554 "\t.previous\n" \
555 _ASM_EXTABLE(1b, 3b) \
556 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
557 : "i" (-EFAULT), "q" (__new), "1" (__old) \
558 : "memory" \
559 ); \
560 break; \
561 } \
562 case 2: \
563 { \
564 asm volatile("\t" ASM_STAC "\n" \
565 "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
566 "2:\t" ASM_CLAC "\n" \
567 "\t.section .fixup, \"ax\"\n" \
568 "3:\tmov %3, %0\n" \
569 "\tjmp 2b\n" \
570 "\t.previous\n" \
571 _ASM_EXTABLE(1b, 3b) \
572 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
573 : "i" (-EFAULT), "r" (__new), "1" (__old) \
574 : "memory" \
575 ); \
576 break; \
577 } \
578 case 4: \
579 { \
580 asm volatile("\t" ASM_STAC "\n" \
581 "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
582 "2:\t" ASM_CLAC "\n" \
583 "\t.section .fixup, \"ax\"\n" \
584 "3:\tmov %3, %0\n" \
585 "\tjmp 2b\n" \
586 "\t.previous\n" \
587 _ASM_EXTABLE(1b, 3b) \
588 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
589 : "i" (-EFAULT), "r" (__new), "1" (__old) \
590 : "memory" \
591 ); \
592 break; \
593 } \
594 case 8: \
595 { \
596 if (!IS_ENABLED(CONFIG_X86_64)) \
597 __cmpxchg_wrong_size(); \
598 \
599 asm volatile("\t" ASM_STAC "\n" \
600 "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
601 "2:\t" ASM_CLAC "\n" \
602 "\t.section .fixup, \"ax\"\n" \
603 "3:\tmov %3, %0\n" \
604 "\tjmp 2b\n" \
605 "\t.previous\n" \
606 _ASM_EXTABLE(1b, 3b) \
607 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
608 : "i" (-EFAULT), "r" (__new), "1" (__old) \
609 : "memory" \
610 ); \
611 break; \
612 } \
613 default: \
614 __cmpxchg_wrong_size(); \
615 } \
616 *__uval = __old; \
617 __ret; \
618})
619
620#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \
621({ \
622 access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \
623 __user_atomic_cmpxchg_inatomic((uval), (ptr), \
624 (old), (new), sizeof(*(ptr))) : \
625 -EFAULT; \
626})
627
528/* 628/*
529 * movsl can be slow when source and dest are not both 8-byte aligned 629 * movsl can be slow when source and dest are not both 8-byte aligned
530 */ 630 */
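A hypothetical caller of the new user_atomic_cmpxchg_inatomic() helper, in the futex style it is aimed at; the function name and retry policy are illustrative, not part of this patch:

static int example_user_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval)
{
	u32 cur;
	int ret;

	pagefault_disable();
	ret = user_atomic_cmpxchg_inatomic(&cur, uaddr, oldval, newval);
	pagefault_enable();

	if (ret)
		return ret;			/* -EFAULT on a bad pointer */
	return cur == oldval ? 0 : -EAGAIN;	/* lost the race; caller retries */
}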
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 190413d0de57..12a26b979bf1 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -204,13 +204,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
204static __must_check __always_inline int 204static __must_check __always_inline int
205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) 205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
206{ 206{
207 return __copy_from_user_nocheck(dst, (__force const void *)src, size); 207 return __copy_from_user_nocheck(dst, src, size);
208} 208}
209 209
210static __must_check __always_inline int 210static __must_check __always_inline int
211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) 211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
212{ 212{
213 return __copy_to_user_nocheck((__force void *)dst, src, size); 213 return __copy_to_user_nocheck(dst, src, size);
214} 214}
215 215
216extern long __copy_user_nocache(void *dst, const void __user *src, 216extern long __copy_user_nocache(void *dst, const void __user *src,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 966502d4682e..2067264fb7f5 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -100,6 +100,7 @@
100 100
101#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f 101#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
102#define VMX_MISC_SAVE_EFER_LMA 0x00000020 102#define VMX_MISC_SAVE_EFER_LMA 0x00000020
103#define VMX_MISC_ACTIVITY_HLT 0x00000040
103 104
104/* VMCS Encodings */ 105/* VMCS Encodings */
105enum vmcs_field { 106enum vmcs_field {
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 0f1be11e43d2..e45e4da96bf1 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -181,7 +181,7 @@ struct x86_msi_ops {
181 u8 hpet_id); 181 u8 hpet_id);
182 void (*teardown_msi_irq)(unsigned int irq); 182 void (*teardown_msi_irq)(unsigned int irq);
183 void (*teardown_msi_irqs)(struct pci_dev *dev); 183 void (*teardown_msi_irqs)(struct pci_dev *dev);
184 void (*restore_msi_irqs)(struct pci_dev *dev, int irq); 184 void (*restore_msi_irqs)(struct pci_dev *dev);
185 int (*setup_hpet_msi)(unsigned int irq, unsigned int id); 185 int (*setup_hpet_msi)(unsigned int irq, unsigned int id);
186 u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag); 186 u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag);
187 u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag); 187 u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag);
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index b913915e8e63..3e276eb23d1b 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -167,7 +167,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
167 */ 167 */
168static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 168static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
169{ 169{
170 unsigned long pfn = mfn_to_pfn(mfn); 170 unsigned long pfn;
171
172 if (xen_feature(XENFEAT_auto_translated_physmap))
173 return mfn;
174
175 pfn = mfn_to_pfn(mfn);
171 if (get_phys_to_machine(pfn) != mfn) 176 if (get_phys_to_machine(pfn) != mfn)
172 return -1; /* force !pfn_valid() */ 177 return -1; /* force !pfn_valid() */
173 return pfn; 178 return pfn;
@@ -222,5 +227,6 @@ void make_lowmem_page_readonly(void *vaddr);
222void make_lowmem_page_readwrite(void *vaddr); 227void make_lowmem_page_readwrite(void *vaddr);
223 228
224#define xen_remap(cookie, size) ioremap((cookie), (size)); 229#define xen_remap(cookie, size) ioremap((cookie), (size));
230#define xen_unmap(cookie) iounmap((cookie))
225 231
226#endif /* _ASM_X86_XEN_PAGE_H */ 232#endif /* _ASM_X86_XEN_PAGE_H */
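The new xen_unmap() simply balances the existing xen_remap() wrapper, e.g. (cookie and size are placeholders):

	ring = xen_remap(cookie, size);
	/* ... use the mapping ... */
	xen_unmap(ring);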
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 0415cdabb5a6..554738963b28 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -9,6 +9,8 @@
9#define XSTATE_FP 0x1 9#define XSTATE_FP 0x1
10#define XSTATE_SSE 0x2 10#define XSTATE_SSE 0x2
11#define XSTATE_YMM 0x4 11#define XSTATE_YMM 0x4
12#define XSTATE_BNDREGS 0x8
13#define XSTATE_BNDCSR 0x10
12 14
13#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) 15#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
14 16
@@ -20,10 +22,14 @@
20#define XSAVE_YMM_SIZE 256 22#define XSAVE_YMM_SIZE 256
21#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) 23#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
22 24
23/* 25/* Supported features which support lazy state saving */
24 * These are the features that the OS can handle currently. 26#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
25 */ 27
26#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) 28/* Supported features which require eager state saving */
29#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR)
30
31/* All currently supported features */
32#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER)
27 33
28#ifdef CONFIG_X86_64 34#ifdef CONFIG_X86_64
29#define REX_PREFIX "0x48, " 35#define REX_PREFIX "0x48, "
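For reference, the mask values implied by the definitions above:

	XSTATE_LAZY  = FP | SSE | YMM       = 0x1 | 0x2 | 0x4 = 0x07
	XSTATE_EAGER = BNDREGS | BNDCSR     = 0x8 | 0x10      = 0x18
	XCNTXT_MASK  = XSTATE_LAZY | XSTATE_EAGER             = 0x1f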
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 9c3733c5f8f7..225b0988043a 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -6,6 +6,7 @@
6#define SETUP_E820_EXT 1 6#define SETUP_E820_EXT 1
7#define SETUP_DTB 2 7#define SETUP_DTB 2
8#define SETUP_PCI 3 8#define SETUP_PCI 3
9#define SETUP_EFI 4
9 10
10/* ram_size flags */ 11/* ram_size flags */
11#define RAMDISK_IMAGE_START_MASK 0x07FF 12#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -23,6 +24,7 @@
23#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) 24#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
24#define XLF_EFI_HANDOVER_32 (1<<2) 25#define XLF_EFI_HANDOVER_32 (1<<2)
25#define XLF_EFI_HANDOVER_64 (1<<3) 26#define XLF_EFI_HANDOVER_64 (1<<3)
27#define XLF_EFI_KEXEC (1<<4)
26 28
27#ifndef __ASSEMBLY__ 29#ifndef __ASSEMBLY__
28 30
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index b8f1c0176cbc..462efe746d77 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -28,6 +28,9 @@
28/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ 28/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
29#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) 29#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
30 30
31/* A partition's reference time stamp counter (TSC) page */
32#define HV_X64_MSR_REFERENCE_TSC 0x40000021
33
31/* 34/*
32 * There is a single feature flag that signifies the presence of the MSR 35 * There is a single feature flag that signifies the presence of the MSR
33 * that can be used to retrieve both the local APIC Timer frequency as 36 * that can be used to retrieve both the local APIC Timer frequency as
@@ -198,6 +201,9 @@
198#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ 201#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
199 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) 202 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
200 203
204#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
205#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
206
201#define HV_PROCESSOR_POWER_STATE_C0 0 207#define HV_PROCESSOR_POWER_STATE_C0 0
202#define HV_PROCESSOR_POWER_STATE_C1 1 208#define HV_PROCESSOR_POWER_STATE_C1 1
203#define HV_PROCESSOR_POWER_STATE_C2 2 209#define HV_PROCESSOR_POWER_STATE_C2 2
@@ -210,4 +216,11 @@
210#define HV_STATUS_INVALID_ALIGNMENT 4 216#define HV_STATUS_INVALID_ALIGNMENT 4
211#define HV_STATUS_INSUFFICIENT_BUFFERS 19 217#define HV_STATUS_INSUFFICIENT_BUFFERS 19
212 218
219typedef struct _HV_REFERENCE_TSC_PAGE {
220 __u32 tsc_sequence;
221 __u32 res1;
222 __u64 tsc_scale;
223 __s64 tsc_offset;
224} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
225
213#endif 226#endif
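For context, a guest consumes the HV_REFERENCE_TSC_PAGE above by re-reading it until tsc_sequence is stable and then applying the usual scale/offset conversion; shown here only as a sketch, since the clocksource code that does this is not part of this header:

	time = ((tsc * tsc_scale) >> 64) + tsc_offset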
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 37813b5ddc37..c19fc60ff062 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -184,6 +184,7 @@
184#define MSR_AMD64_PATCH_LOADER 0xc0010020 184#define MSR_AMD64_PATCH_LOADER 0xc0010020
185#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 185#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
186#define MSR_AMD64_OSVW_STATUS 0xc0010141 186#define MSR_AMD64_OSVW_STATUS 0xc0010141
187#define MSR_AMD64_LS_CFG 0xc0011020
187#define MSR_AMD64_DC_CFG 0xc0011022 188#define MSR_AMD64_DC_CFG 0xc0011022
188#define MSR_AMD64_BU_CFG2 0xc001102a 189#define MSR_AMD64_BU_CFG2 0xc001102a
189#define MSR_AMD64_IBSFETCHCTL 0xc0011030 190#define MSR_AMD64_IBSFETCHCTL 0xc0011030
@@ -527,6 +528,7 @@
527#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e 528#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
528#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f 529#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
529#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 530#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
531#define MSR_IA32_VMX_VMFUNC 0x00000491
530 532
531/* VMX_BASIC bits and bitmasks */ 533/* VMX_BASIC bits and bitmasks */
532#define VMX_BASIC_VMCS_SIZE_SHIFT 32 534#define VMX_BASIC_VMCS_SIZE_SHIFT 32
diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h
index 7b3ddc348585..bc03eb5d6360 100644
--- a/arch/x86/include/uapi/asm/stat.h
+++ b/arch/x86/include/uapi/asm/stat.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_STAT_H 1#ifndef _ASM_X86_STAT_H
2#define _ASM_X86_STAT_H 2#define _ASM_X86_STAT_H
3 3
4#include <asm/posix_types.h>
5
4#define STAT_HAVE_NSEC 1 6#define STAT_HAVE_NSEC 1
5 7
6#ifdef __i386__ 8#ifdef __i386__
@@ -78,26 +80,26 @@ struct stat64 {
78#else /* __i386__ */ 80#else /* __i386__ */
79 81
80struct stat { 82struct stat {
81 unsigned long st_dev; 83 __kernel_ulong_t st_dev;
82 unsigned long st_ino; 84 __kernel_ulong_t st_ino;
83 unsigned long st_nlink; 85 __kernel_ulong_t st_nlink;
84 86
85 unsigned int st_mode; 87 unsigned int st_mode;
86 unsigned int st_uid; 88 unsigned int st_uid;
87 unsigned int st_gid; 89 unsigned int st_gid;
88 unsigned int __pad0; 90 unsigned int __pad0;
89 unsigned long st_rdev; 91 __kernel_ulong_t st_rdev;
90 long st_size; 92 __kernel_long_t st_size;
91 long st_blksize; 93 __kernel_long_t st_blksize;
92 long st_blocks; /* Number 512-byte blocks allocated. */ 94 __kernel_long_t st_blocks; /* Number 512-byte blocks allocated. */
93 95
94 unsigned long st_atime; 96 __kernel_ulong_t st_atime;
95 unsigned long st_atime_nsec; 97 __kernel_ulong_t st_atime_nsec;
96 unsigned long st_mtime; 98 __kernel_ulong_t st_mtime;
97 unsigned long st_mtime_nsec; 99 __kernel_ulong_t st_mtime_nsec;
98 unsigned long st_ctime; 100 __kernel_ulong_t st_ctime;
99 unsigned long st_ctime_nsec; 101 __kernel_ulong_t st_ctime_nsec;
100 long __unused[3]; 102 __kernel_long_t __unused[3];
101}; 103};
102 104
103/* We don't need to memset the whole thing just to initialize the padding */ 105/* We don't need to memset the whole thing just to initialize the padding */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e2cd79..cb648c84b327 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,10 +29,11 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
29obj-y += syscall_$(BITS).o 29obj-y += syscall_$(BITS).o
30obj-$(CONFIG_X86_64) += vsyscall_64.o 30obj-$(CONFIG_X86_64) += vsyscall_64.o
31obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 31obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
32obj-$(CONFIG_SYSFS) += ksysfs.o
32obj-y += bootflag.o e820.o 33obj-y += bootflag.o e820.o
33obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 34obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
34obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 35obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
35obj-y += tsc.o io_delay.o rtc.o 36obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
36obj-y += pci-iommu_table.o 37obj-y += pci-iommu_table.o
37obj-y += resource.o 38obj-y += resource.o
38 39
@@ -91,15 +92,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
91 92
92obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 93obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
93 94
94obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
95obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
96obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
97microcode-y := microcode_core.o
98microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
99microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
100obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o
101obj-$(CONFIG_MICROCODE) += microcode.o
102
103obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 95obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
104 96
105obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 97obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
@@ -111,6 +103,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
111 103
112obj-$(CONFIG_PERF_EVENTS) += perf_regs.o 104obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
113obj-$(CONFIG_TRACING) += tracepoint.o 105obj-$(CONFIG_TRACING) += tracepoint.o
106obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
114 107
115### 108###
116# 64 bit specific files 109# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0b0b91b83d51..1dac94265b59 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1033,9 +1033,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1033 1033
1034 if (!acpi_ioapic) 1034 if (!acpi_ioapic)
1035 return 0; 1035 return 0;
1036 if (!dev) 1036 if (!dev || !dev_is_pci(dev))
1037 return 0;
1038 if (dev->bus != &pci_bus_type)
1039 return 0; 1037 return 0;
1040 1038
1041 pdev = to_pci_dev(dev); 1039 pdev = to_pci_dev(dev);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
150} 150}
151EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 151EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
152 152
153/*
154 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
155 * which can obviate IPI to trigger checking of need_resched.
156 * We execute MONITOR against need_resched and enter optimized wait state
157 * through MWAIT. Whenever someone changes need_resched, we would be woken
158 * up from MWAIT (without an IPI).
159 *
160 * New with Core Duo processors, MWAIT can take some hints based on CPU
161 * capability.
162 */
163void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
164{
165 if (!need_resched()) {
166 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
167 clflush((void *)&current_thread_info()->flags);
168
169 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb();
171 if (!need_resched())
172 __mwait(ax, cx);
173 }
174}
175
176void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) 153void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
177{ 154{
178 unsigned int cpu = smp_processor_id(); 155 unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d278736bf774..7f26c9a70a9e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid;
75physid_mask_t phys_cpu_present_map; 75physid_mask_t phys_cpu_present_map;
76 76
77/* 77/*
78 * Processor to be disabled specified by kernel parameter
79 * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
80 * avoid undefined behaviour caused by sending INIT from AP to BSP.
81 */
82static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
83
84/*
78 * Map cpu index to physical APIC ID 85 * Map cpu index to physical APIC ID
79 */ 86 */
80DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); 87DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1968,7 +1975,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
1968 */ 1975 */
1969static inline void __smp_error_interrupt(struct pt_regs *regs) 1976static inline void __smp_error_interrupt(struct pt_regs *regs)
1970{ 1977{
1971 u32 v0, v1; 1978 u32 v;
1972 u32 i = 0; 1979 u32 i = 0;
1973 static const char * const error_interrupt_reason[] = { 1980 static const char * const error_interrupt_reason[] = {
1974 "Send CS error", /* APIC Error Bit 0 */ 1981 "Send CS error", /* APIC Error Bit 0 */
@@ -1982,21 +1989,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
1982 }; 1989 };
1983 1990
1984 /* First tickle the hardware, only then report what went on. -- REW */ 1991 /* First tickle the hardware, only then report what went on. -- REW */
1985 v0 = apic_read(APIC_ESR);
1986 apic_write(APIC_ESR, 0); 1992 apic_write(APIC_ESR, 0);
1987 v1 = apic_read(APIC_ESR); 1993 v = apic_read(APIC_ESR);
1988 ack_APIC_irq(); 1994 ack_APIC_irq();
1989 atomic_inc(&irq_err_count); 1995 atomic_inc(&irq_err_count);
1990 1996
1991 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", 1997 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
1992 smp_processor_id(), v0 , v1); 1998 smp_processor_id(), v);
1993 1999
1994 v1 = v1 & 0xff; 2000 v &= 0xff;
1995 while (v1) { 2001 while (v) {
1996 if (v1 & 0x1) 2002 if (v & 0x1)
1997 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); 2003 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1998 i++; 2004 i++;
1999 v1 >>= 1; 2005 v >>= 1;
2000 } 2006 }
2001 2007
2002 apic_printk(APIC_DEBUG, KERN_CONT "\n"); 2008 apic_printk(APIC_DEBUG, KERN_CONT "\n");
@@ -2115,6 +2121,39 @@ int generic_processor_info(int apicid, int version)
2115 phys_cpu_present_map); 2121 phys_cpu_present_map);
2116 2122
2117 /* 2123 /*
2124 * boot_cpu_physical_apicid is designed to have the apicid
2125 * returned by read_apic_id(), i.e, the apicid of the
2126 * currently booting-up processor. However, on some platforms,
2127 * it is temporarily modified by the apicid reported as BSP
2128 * through MP table. Concretely:
2129 *
2130 * - arch/x86/kernel/mpparse.c: MP_processor_info()
2131 * - arch/x86/mm/amdtopology.c: amd_numa_init()
2132 * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info()
2133 *
2134 * This function is executed with the modified
2135 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
2136 * parameter doesn't work to disable APs on kdump 2nd kernel.
2137 *
2138 * Since fixing handling of boot_cpu_physical_apicid requires
2139 * another discussion and tests on each platform, we leave it
2140 * for now and here we use read_apic_id() directly in this
2141 * function, generic_processor_info().
2142 */
2143 if (disabled_cpu_apicid != BAD_APICID &&
2144 disabled_cpu_apicid != read_apic_id() &&
2145 disabled_cpu_apicid == apicid) {
2146 int thiscpu = num_processors + disabled_cpus;
2147
2148 pr_warning("APIC: Disabling requested cpu."
2149 " Processor %d/0x%x ignored.\n",
2150 thiscpu, apicid);
2151
2152 disabled_cpus++;
2153 return -ENODEV;
2154 }
2155
2156 /*
2118 * If boot cpu has not been detected yet, then only allow upto 2157 * If boot cpu has not been detected yet, then only allow upto
2119 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu 2158 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
2120 */ 2159 */
@@ -2592,3 +2631,12 @@ static int __init lapic_insert_resource(void)
2592 * that is using request_resource 2631 * that is using request_resource
2593 */ 2632 */
2594late_initcall(lapic_insert_resource); 2633late_initcall(lapic_insert_resource);
2634
2635static int __init apic_set_disabled_cpu_apicid(char *arg)
2636{
2637 if (!arg || !get_option(&arg, &disabled_cpu_apicid))
2638 return -EINVAL;
2639
2640 return 0;
2641}
2642early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
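Typical usage, per the comment above: the kdump second kernel is booted with something like

	disable_cpu_apicid=0

on its command line, where the value is the APIC ID of the crashed kernel's boot CPU (machine-specific; 0 is only an example), so the capture kernel never sends INIT to that processor.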
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index ccbf857d1d55..2c621a6b901a 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,7 +14,6 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/hardirq.h> 17#include <linux/hardirq.h>
19#include <linux/module.h> 18#include <linux/module.h>
20#include <asm/smp.h> 19#include <asm/smp.h>
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b4099..191ce75c0e54 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h> 18#include <linux/errno.h>
20#include <asm/fixmap.h> 19#include <asm/fixmap.h>
21#include <asm/mpspec.h> 20#include <asm/mpspec.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4d67a7531d45..6ad4658de705 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1139,9 +1139,10 @@ next:
1139 if (test_bit(vector, used_vectors)) 1139 if (test_bit(vector, used_vectors))
1140 goto next; 1140 goto next;
1141 1141
1142 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) 1142 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
1143 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1143 if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
1144 goto next; 1144 goto next;
1145 }
1145 /* Found one! */ 1146 /* Found one! */
1146 current_vector = vector; 1147 current_vector = vector;
1147 current_offset = offset; 1148 current_offset = offset;
@@ -1180,7 +1181,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1180 1181
1181 vector = cfg->vector; 1182 vector = cfg->vector;
1182 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) 1183 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
1183 per_cpu(vector_irq, cpu)[vector] = -1; 1184 per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
1184 1185
1185 cfg->vector = 0; 1186 cfg->vector = 0;
1186 cpumask_clear(cfg->domain); 1187 cpumask_clear(cfg->domain);
@@ -1188,11 +1189,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1188 if (likely(!cfg->move_in_progress)) 1189 if (likely(!cfg->move_in_progress))
1189 return; 1190 return;
1190 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { 1191 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
1191 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; 1192 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
1192 vector++) {
1193 if (per_cpu(vector_irq, cpu)[vector] != irq) 1193 if (per_cpu(vector_irq, cpu)[vector] != irq)
1194 continue; 1194 continue;
1195 per_cpu(vector_irq, cpu)[vector] = -1; 1195 per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
1196 break; 1196 break;
1197 } 1197 }
1198 } 1198 }
@@ -1225,12 +1225,12 @@ void __setup_vector_irq(int cpu)
1225 /* Mark the free vectors */ 1225 /* Mark the free vectors */
1226 for (vector = 0; vector < NR_VECTORS; ++vector) { 1226 for (vector = 0; vector < NR_VECTORS; ++vector) {
1227 irq = per_cpu(vector_irq, cpu)[vector]; 1227 irq = per_cpu(vector_irq, cpu)[vector];
1228 if (irq < 0) 1228 if (irq <= VECTOR_UNDEFINED)
1229 continue; 1229 continue;
1230 1230
1231 cfg = irq_cfg(irq); 1231 cfg = irq_cfg(irq);
1232 if (!cpumask_test_cpu(cpu, cfg->domain)) 1232 if (!cpumask_test_cpu(cpu, cfg->domain))
1233 per_cpu(vector_irq, cpu)[vector] = -1; 1233 per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
1234 } 1234 }
1235 raw_spin_unlock(&vector_lock); 1235 raw_spin_unlock(&vector_lock);
1236} 1236}
@@ -2199,13 +2199,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2199 2199
2200 me = smp_processor_id(); 2200 me = smp_processor_id();
2201 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2201 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2202 unsigned int irq; 2202 int irq;
2203 unsigned int irr; 2203 unsigned int irr;
2204 struct irq_desc *desc; 2204 struct irq_desc *desc;
2205 struct irq_cfg *cfg; 2205 struct irq_cfg *cfg;
2206 irq = __this_cpu_read(vector_irq[vector]); 2206 irq = __this_cpu_read(vector_irq[vector]);
2207 2207
2208 if (irq == -1) 2208 if (irq <= VECTOR_UNDEFINED)
2209 continue; 2209 continue;
2210 2210
2211 desc = irq_to_desc(irq); 2211 desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d8556d09..62071569bd50 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/init.h>
4 3
5#include <linux/mm.h> 4#include <linux/mm.h>
6#include <linux/delay.h> 5#include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 77c95c0e1bf7..00146f9b0254 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -29,7 +29,6 @@
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__ 29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30 30
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/init.h>
33#include <asm/io.h> 32#include <asm/io.h>
34#include <asm/bios_ebda.h> 33#include <asm/bios_ebda.h>
35 34
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29db478d..cac85ee6913f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h> 6#include <linux/dmar.h>
8#include <linux/cpu.h> 7#include <linux/cpu.h>
9 8
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d433c8..de231e328cae 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h> 6#include <linux/dmar.h>
8 7
9#include <asm/smp.h> 8#include <asm/smp.h>
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
91 91
92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
93 93
94 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 94 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), 95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
96 PAGE_SIZE, corruption_check_size); 96 PAGE_SIZE, corruption_check_size);
97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), 97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..7fd54f09b011 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
36endif 36endif
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o 37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
38obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 38obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
39obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o 39obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
40endif 40endif
41 41
42 42
43obj-$(CONFIG_X86_MCE) += mcheck/ 43obj-$(CONFIG_X86_MCE) += mcheck/
44obj-$(CONFIG_MTRR) += mtrr/ 44obj-$(CONFIG_MTRR) += mtrr/
45obj-$(CONFIG_MICROCODE) += microcode/
45 46
46obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o 47obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
47 48
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..d3153e281d72 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
1#include <linux/export.h> 1#include <linux/export.h>
2#include <linux/init.h>
3#include <linux/bitops.h> 2#include <linux/bitops.h>
4#include <linux/elf.h> 3#include <linux/elf.h>
5#include <linux/mm.h> 4#include <linux/mm.h>
@@ -487,7 +486,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
487 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 486 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
488 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 487 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
489 if (!check_tsc_unstable()) 488 if (!check_tsc_unstable())
490 sched_clock_stable = 1; 489 set_sched_clock_stable();
491 } 490 }
492 491
493#ifdef CONFIG_X86_64 492#ifdef CONFIG_X86_64
@@ -508,6 +507,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
508 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 507 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
509 } 508 }
510#endif 509#endif
510
511 /* F16h erratum 793, CVE-2013-6885 */
512 if (c->x86 == 0x16 && c->x86_model <= 0xf) {
513 u64 val;
514
515 rdmsrl(MSR_AMD64_LS_CFG, val);
516 if (!(val & BIT(15)))
517 wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15));
518 }
519
511} 520}
512 521
513static const int amd_erratum_383[]; 522static const int amd_erratum_383[];
@@ -790,14 +799,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
790 } 799 }
791 800
792 /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ 801 /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
793 if (!((eax >> 16) & mask)) { 802 if (!((eax >> 16) & mask))
794 u32 a, b, c, d; 803 tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
795 804 else
796 cpuid(0x80000005, &a, &b, &c, &d);
797 tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
798 } else {
799 tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; 805 tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
800 }
801 806
802 /* a 4M entry uses two 2M entries */ 807 /* a 4M entry uses two 2M entries */
803 tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; 808 tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 8d5652dc99dd..8779edab684e 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
1#include <linux/bitops.h> 1#include <linux/bitops.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/init.h>
4 3
5#include <asm/processor.h> 4#include <asm/processor.h>
6#include <asm/e820.h> 5#include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172b8258..24b6fd10625a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
472u16 __read_mostly tlb_lld_4k[NR_INFO]; 472u16 __read_mostly tlb_lld_4k[NR_INFO];
473u16 __read_mostly tlb_lld_2m[NR_INFO]; 473u16 __read_mostly tlb_lld_2m[NR_INFO];
474u16 __read_mostly tlb_lld_4m[NR_INFO]; 474u16 __read_mostly tlb_lld_4m[NR_INFO];
475u16 __read_mostly tlb_lld_1g[NR_INFO];
475 476
476/* 477/*
477 * tlb_flushall_shift shows the balance point in replacing cr3 write 478 * tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
486 if (this_cpu->c_detect_tlb) 487 if (this_cpu->c_detect_tlb)
487 this_cpu->c_detect_tlb(c); 488 this_cpu->c_detect_tlb(c);
488 489
489 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ 490 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
490 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ 491 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
491 "tlb_flushall_shift: %d\n", 492 "tlb_flushall_shift: %d\n",
492 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], 493 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
493 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], 494 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
494 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], 495 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
495 tlb_flushall_shift); 496 tlb_lld_1g[ENTRIES], tlb_flushall_shift);
496} 497}
497 498
498void detect_ht(struct cpuinfo_x86 *c) 499void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c75ab54..aaf152e79637 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
1#include <linux/init.h>
2#include <linux/bitops.h> 1#include <linux/bitops.h>
3#include <linux/delay.h> 2#include <linux/delay.h>
4#include <linux/pci.h> 3#include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b342c026..3db61c644e44 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
1#include <linux/init.h>
2#include <linux/kernel.h> 1#include <linux/kernel.h>
3 2
4#include <linux/string.h> 3#include <linux/string.h>
@@ -93,7 +92,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
93 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 92 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
94 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 93 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
95 if (!check_tsc_unstable()) 94 if (!check_tsc_unstable())
96 sched_clock_stable = 1; 95 set_sched_clock_stable();
97 } 96 }
98 97
99 /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ 98 /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -506,6 +505,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
506#define TLB_DATA0_2M_4M 0x23 505#define TLB_DATA0_2M_4M 0x23
507 506
508#define STLB_4K 0x41 507#define STLB_4K 0x41
508#define STLB_4K_2M 0x42
509 509
510static const struct _tlb_table intel_tlb_table[] = { 510static const struct _tlb_table intel_tlb_table[] = {
511 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, 511 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -526,13 +526,20 @@ static const struct _tlb_table intel_tlb_table[] = {
526 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, 526 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
527 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, 527 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
528 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, 528 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
 529 { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, fully associative" },
530 { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
531 { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
529 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, 532 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
530 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, 533 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
531 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, 534 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
532 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, 535 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
533 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, 536 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
537 { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
538 { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
534 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, 539 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
535 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, 540 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
541 { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
542 { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
536 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, 543 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
537 { 0x00, 0, 0 } 544 { 0x00, 0, 0 }
538}; 545};
@@ -558,6 +565,20 @@ static void intel_tlb_lookup(const unsigned char desc)
558 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) 565 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
559 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; 566 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
560 break; 567 break;
568 case STLB_4K_2M:
569 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
570 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
571 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
572 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
573 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
574 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
575 if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
576 tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
577 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
578 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
579 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
580 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
581 break;
561 case TLB_INST_ALL: 582 case TLB_INST_ALL:
562 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) 583 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
563 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; 584 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -603,6 +624,10 @@ static void intel_tlb_lookup(const unsigned char desc)
603 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) 624 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
604 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; 625 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
605 break; 626 break;
627 case TLB_DATA_1G:
628 if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
629 tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
630 break;
606 } 631 }
607} 632}
608 633
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a53f69..a1aef9533154 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
33#include <linux/acpi.h> 33#include <linux/acpi.h>
34#include <linux/cper.h> 34#include <linux/cper.h>
35#include <acpi/apei.h> 35#include <acpi/apei.h>
36#include <acpi/ghes.h>
36#include <asm/mce.h> 37#include <asm/mce.h>
37 38
38#include "mce-internal.h" 39#include "mce-internal.h"
39 40
40void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) 41void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
41{ 42{
42 struct mce m; 43 struct mce m;
43 44
44 /* Only corrected MC is reported */ 45 if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
45 if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
46 return; 46 return;
47 47
48 mce_setup(&m); 48 mce_setup(&m);
49 m.bank = 1; 49 m.bank = 1;
50 /* Fake a memory read corrected error with unknown channel */ 50 /* Fake a memory read error with unknown channel */
51 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; 51 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
52
53 if (severity >= GHES_SEV_RECOVERABLE)
54 m.status |= MCI_STATUS_UC;
55 if (severity >= GHES_SEV_PANIC)
56 m.status |= MCI_STATUS_PCC;
57
52 m.addr = mem_err->physical_addr; 58 m.addr = mem_err->physical_addr;
53 mce_log(&m); 59 mce_log(&m);
54 mce_notify_irq(); 60 mce_notify_irq();
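
For reference, a minimal sketch (not part of the patch; the helper name is made up) of the status word that apei_mce_report_mem_error() now builds for each GHES severity, using only the constants visible in the hunk above:

	/* Illustration only: resulting m.status per severity level. */
	static u64 apei_status_for_severity(int severity)
	{
		/* base: fake memory read error with unknown channel */
		u64 status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;

		if (severity >= GHES_SEV_RECOVERABLE)	/* uncorrected */
			status |= MCI_STATUS_UC;
		if (severity >= GHES_SEV_PANIC)		/* processor context corrupt */
			status |= MCI_STATUS_PCC;

		return status;
	}
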
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cdee95f..4d5419b249da 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1638 1638
1639static void mce_start_timer(unsigned int cpu, struct timer_list *t) 1639static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1640{ 1640{
1641 unsigned long iv = mce_adjust_timer(check_interval * HZ); 1641 unsigned long iv = check_interval * HZ;
1642
1643 __this_cpu_write(mce_next_interval, iv);
1644 1642
1645 if (mca_cfg.ignore_ce || !iv) 1643 if (mca_cfg.ignore_ce || !iv)
1646 return; 1644 return;
1647 1645
1646 per_cpu(mce_next_interval, cpu) = iv;
1647
1648 t->expires = round_jiffies(jiffies + iv); 1648 t->expires = round_jiffies(jiffies + iv);
1649 add_timer_on(t, smp_processor_id()); 1649 add_timer_on(t, cpu);
1650} 1650}
1651 1651
1652static void __mcheck_cpu_init_timer(void) 1652static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
2272 dev->release = &mce_device_release; 2272 dev->release = &mce_device_release;
2273 2273
2274 err = device_register(dev); 2274 err = device_register(dev);
2275 if (err) 2275 if (err) {
2276 put_device(dev);
2276 return err; 2277 return err;
2278 }
2277 2279
2278 for (i = 0; mce_device_attrs[i]; i++) { 2280 for (i = 0; mce_device_attrs[i]; i++) {
2279 err = device_create_file(dev, mce_device_attrs[i]); 2281 err = device_create_file(dev, mce_device_attrs[i]);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe0458ca66..fb6156fee6f7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/gfp.h> 8#include <linux/gfp.h>
9#include <linux/init.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
11#include <linux/percpu.h> 10#include <linux/percpu.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1ccc59..a3042989398c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h> 8#include <linux/smp.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701aecaa1..7dc5564d0cdf 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/init.h>
9 8
10#include <asm/processor.h> 9#include <asm/processor.h>
11#include <asm/mce.h> 10#include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..285c85427c32
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
1microcode-y := core.o
2obj-$(CONFIG_MICROCODE) += microcode.o
3microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
4microcode-$(CONFIG_MICROCODE_AMD) += amd.o
5obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
6obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
7obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index c3d4cc972eca..8fffd845e22b 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
182{ 182{
183 u32 rev, dummy; 183 u32 rev, dummy;
184 184
185 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 185 native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
186 186
187 /* verify patch application was successful */ 187 /* verify patch application was successful */
188 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 188 native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
189 if (rev != mc_amd->hdr.patch_id) 189 if (rev != mc_amd->hdr.patch_id)
190 return -1; 190 return -1;
191 191
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
332 patch->patch_id = mc_hdr->patch_id; 332 patch->patch_id = mc_hdr->patch_id;
333 patch->equiv_cpu = proc_id; 333 patch->equiv_cpu = proc_id;
334 334
335 pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
336 __func__, patch->patch_id, proc_id);
337
335 /* ... and add to cache. */ 338 /* ... and add to cache. */
336 update_cache(patch); 339 update_cache(patch);
337 340
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
390 if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { 393 if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
391 struct ucode_patch *p = find_patch(smp_processor_id()); 394 struct ucode_patch *p = find_patch(smp_processor_id());
392 if (p) { 395 if (p) {
393 memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); 396 memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
394 memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), 397 memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
395 MPB_MAX_SIZE)); 398 PATCH_MAX_SIZE));
396 } 399 }
397 } 400 }
398#endif 401#endif
@@ -430,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
430 if (c->x86 >= 0x15) 433 if (c->x86 >= 0x15)
431 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); 434 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
432 435
433 if (request_firmware(&fw, (const char *)fw_name, device)) { 436 if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
434 pr_debug("failed to load file %s\n", fw_name); 437 pr_debug("failed to load file %s\n", fw_name);
435 goto out; 438 goto out;
436 } 439 }
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 6073104ccaa3..8384c0fa206f 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -2,6 +2,7 @@
2 * Copyright (C) 2013 Advanced Micro Devices, Inc. 2 * Copyright (C) 2013 Advanced Micro Devices, Inc.
3 * 3 *
4 * Author: Jacob Shin <jacob.shin@amd.com> 4 * Author: Jacob Shin <jacob.shin@amd.com>
5 * Fixes: Borislav Petkov <bp@suse.de>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
15#include <asm/setup.h> 16#include <asm/setup.h>
16#include <asm/microcode_amd.h> 17#include <asm/microcode_amd.h>
17 18
18static bool ucode_loaded; 19/*
20 * This points to the current valid container of microcode patches which we will
21 * save from the initrd before jettisoning its contents.
22 */
23static u8 *container;
24static size_t container_size;
25
19static u32 ucode_new_rev; 26static u32 ucode_new_rev;
20static unsigned long ucode_offset; 27u8 amd_ucode_patch[PATCH_MAX_SIZE];
21static size_t ucode_size; 28static u16 this_equiv_id;
29
30struct cpio_data ucode_cpio;
22 31
23/* 32/*
24 * Microcode patch container file is prepended to the initrd in cpio format. 33 * Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
32 char *path; 41 char *path;
33 void *start; 42 void *start;
34 size_t size; 43 size_t size;
35 unsigned long *uoffset;
36 size_t *usize;
37 struct cpio_data cd;
38 44
39#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
40 struct boot_params *p; 46 struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
47 path = (char *)__pa_nodebug(ucode_path); 53 path = (char *)__pa_nodebug(ucode_path);
48 start = (void *)p->hdr.ramdisk_image; 54 start = (void *)p->hdr.ramdisk_image;
49 size = p->hdr.ramdisk_size; 55 size = p->hdr.ramdisk_size;
50 uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
51 usize = (size_t *)__pa_nodebug(&ucode_size);
52#else 56#else
53 path = ucode_path; 57 path = ucode_path;
54 start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); 58 start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
55 size = boot_params.hdr.ramdisk_size; 59 size = boot_params.hdr.ramdisk_size;
56 uoffset = &ucode_offset;
57 usize = &ucode_size;
58#endif 60#endif
59 61
60 cd = find_cpio_data(path, start, size, &offset); 62 return find_cpio_data(path, start, size, &offset);
61 if (!cd.data) 63}
62 return cd;
63 64
64 if (*(u32 *)cd.data != UCODE_MAGIC) { 65static size_t compute_container_size(u8 *data, u32 total_size)
65 cd.data = NULL; 66{
66 cd.size = 0; 67 size_t size = 0;
67 return cd; 68 u32 *header = (u32 *)data;
68 }
69 69
70 *uoffset = (u8 *)cd.data - (u8 *)start; 70 if (header[0] != UCODE_MAGIC ||
71 *usize = cd.size; 71 header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
72 header[2] == 0) /* size */
73 return size;
72 74
73 return cd; 75 size = header[2] + CONTAINER_HDR_SZ;
76 total_size -= size;
77 data += size;
78
79 while (total_size) {
80 u16 patch_size;
81
82 header = (u32 *)data;
83
84 if (header[0] != UCODE_UCODE_TYPE)
85 break;
86
87 /*
88 * Sanity-check patch size.
89 */
90 patch_size = header[1];
91 if (patch_size > PATCH_MAX_SIZE)
92 break;
93
94 size += patch_size + SECTION_HDR_SIZE;
95 data += patch_size + SECTION_HDR_SIZE;
96 total_size -= patch_size + SECTION_HDR_SIZE;
97 }
98
99 return size;
74} 100}
75 101
76/* 102/*
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
85static void apply_ucode_in_initrd(void *ucode, size_t size) 111static void apply_ucode_in_initrd(void *ucode, size_t size)
86{ 112{
87 struct equiv_cpu_entry *eq; 113 struct equiv_cpu_entry *eq;
114 size_t *cont_sz;
88 u32 *header; 115 u32 *header;
89 u8 *data; 116 u8 *data, **cont;
90 u16 eq_id = 0; 117 u16 eq_id = 0;
91 int offset, left; 118 int offset, left;
92 u32 rev, eax; 119 u32 rev, eax, ebx, ecx, edx;
93 u32 *new_rev; 120 u32 *new_rev;
94 unsigned long *uoffset;
95 size_t *usize;
96 121
97#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
98 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); 123 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
99 uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); 124 cont_sz = (size_t *)__pa_nodebug(&container_size);
100 usize = (size_t *)__pa_nodebug(&ucode_size); 125 cont = (u8 **)__pa_nodebug(&container);
101#else 126#else
102 new_rev = &ucode_new_rev; 127 new_rev = &ucode_new_rev;
103 uoffset = &ucode_offset; 128 cont_sz = &container_size;
104 usize = &ucode_size; 129 cont = &container;
105#endif 130#endif
106 131
107 data = ucode; 132 data = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
109 header = (u32 *)data; 134 header = (u32 *)data;
110 135
111 /* find equiv cpu table */ 136 /* find equiv cpu table */
112 137 if (header[0] != UCODE_MAGIC ||
113 if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ 138 header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
114 header[2] == 0) /* size */ 139 header[2] == 0) /* size */
115 return; 140 return;
116 141
117 eax = cpuid_eax(0x00000001); 142 eax = 0x00000001;
143 ecx = 0;
144 native_cpuid(&eax, &ebx, &ecx, &edx);
118 145
119 while (left > 0) { 146 while (left > 0) {
120 eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); 147 eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
121 148
149 *cont = data;
150
151 /* Advance past the container header */
122 offset = header[2] + CONTAINER_HDR_SZ; 152 offset = header[2] + CONTAINER_HDR_SZ;
123 data += offset; 153 data += offset;
124 left -= offset; 154 left -= offset;
125 155
126 eq_id = find_equiv_id(eq, eax); 156 eq_id = find_equiv_id(eq, eax);
127 if (eq_id) 157 if (eq_id) {
158 this_equiv_id = eq_id;
159 *cont_sz = compute_container_size(*cont, left + offset);
160
161 /*
162 * truncate how much we need to iterate over in the
163 * ucode update loop below
164 */
165 left = *cont_sz - offset;
128 break; 166 break;
167 }
129 168
130 /* 169 /*
131 * support multiple container files appended together. if this 170 * support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
145 184
146 /* mark where the next microcode container file starts */ 185 /* mark where the next microcode container file starts */
147 offset = data - (u8 *)ucode; 186 offset = data - (u8 *)ucode;
148 *uoffset += offset;
149 *usize -= offset;
150 ucode = data; 187 ucode = data;
151 } 188 }
152 189
153 if (!eq_id) { 190 if (!eq_id) {
154 *usize = 0; 191 *cont = NULL;
192 *cont_sz = 0;
155 return; 193 return;
156 } 194 }
157 195
158 /* find ucode and update if needed */ 196 /* find ucode and update if needed */
159 197
160 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); 198 native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
161 199
162 while (left > 0) { 200 while (left > 0) {
163 struct microcode_amd *mc; 201 struct microcode_amd *mc;
@@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
168 break; 206 break;
169 207
170 mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); 208 mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
171 if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) 209
172 if (__apply_microcode_amd(mc) == 0) { 210 if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
211
212 if (!__apply_microcode_amd(mc)) {
173 rev = mc->hdr.patch_id; 213 rev = mc->hdr.patch_id;
174 *new_rev = rev; 214 *new_rev = rev;
215
216 /* save ucode patch */
217 memcpy(amd_ucode_patch, mc,
218 min_t(u32, header[1], PATCH_MAX_SIZE));
175 } 219 }
220 }
176 221
177 offset = header[1] + SECTION_HDR_SIZE; 222 offset = header[1] + SECTION_HDR_SIZE;
178 data += offset; 223 data += offset;
179 left -= offset; 224 left -= offset;
180 } 225 }
181
182 /* mark where this microcode container file ends */
183 offset = *usize - (data - (u8 *)ucode);
184 *usize -= offset;
185
186 if (!(*new_rev))
187 *usize = 0;
188} 226}
189 227
190void __init load_ucode_amd_bsp(void) 228void __init load_ucode_amd_bsp(void)
191{ 229{
192 struct cpio_data cd = find_ucode_in_initrd(); 230 struct cpio_data cp;
193 if (!cd.data) 231 void **data;
232 size_t *size;
233
234#ifdef CONFIG_X86_32
235 data = (void **)__pa_nodebug(&ucode_cpio.data);
236 size = (size_t *)__pa_nodebug(&ucode_cpio.size);
237#else
238 data = &ucode_cpio.data;
239 size = &ucode_cpio.size;
240#endif
241
242 cp = find_ucode_in_initrd();
243 if (!cp.data)
194 return; 244 return;
195 245
196 apply_ucode_in_initrd(cd.data, cd.size); 246 *data = cp.data;
247 *size = cp.size;
248
249 apply_ucode_in_initrd(cp.data, cp.size);
197} 250}
198 251
199#ifdef CONFIG_X86_32 252#ifdef CONFIG_X86_32
200u8 amd_bsp_mpb[MPB_MAX_SIZE];
201
202/* 253/*
203 * On 32-bit, since AP's early load occurs before paging is turned on, we 254 * On 32-bit, since AP's early load occurs before paging is turned on, we
204 * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during 255 * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
205 * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During 256 * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
206 * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which 257 * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
207 * is used upon resume from suspend. 258 * which is used upon resume from suspend.
208 */ 259 */
209void load_ucode_amd_ap(void) 260void load_ucode_amd_ap(void)
210{ 261{
211 struct microcode_amd *mc; 262 struct microcode_amd *mc;
212 unsigned long *initrd;
213 unsigned long *uoffset;
214 size_t *usize; 263 size_t *usize;
215 void *ucode; 264 void **ucode;
216 265
217 mc = (struct microcode_amd *)__pa(amd_bsp_mpb); 266 mc = (struct microcode_amd *)__pa(amd_ucode_patch);
218 if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { 267 if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
219 __apply_microcode_amd(mc); 268 __apply_microcode_amd(mc);
220 return; 269 return;
221 } 270 }
222 271
223 initrd = (unsigned long *)__pa(&initrd_start); 272 ucode = (void *)__pa_nodebug(&container);
224 uoffset = (unsigned long *)__pa(&ucode_offset); 273 usize = (size_t *)__pa_nodebug(&container_size);
225 usize = (size_t *)__pa(&ucode_size);
226 274
227 if (!*usize || !*initrd) 275 if (!*ucode || !*usize)
228 return; 276 return;
229 277
230 ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); 278 apply_ucode_in_initrd(*ucode, *usize);
231 apply_ucode_in_initrd(ucode, *usize);
232} 279}
233 280
234static void __init collect_cpu_sig_on_bsp(void *arg) 281static void __init collect_cpu_sig_on_bsp(void *arg)
235{ 282{
236 unsigned int cpu = smp_processor_id(); 283 unsigned int cpu = smp_processor_id();
237 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 284 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
285
238 uci->cpu_sig.sig = cpuid_eax(0x00000001); 286 uci->cpu_sig.sig = cpuid_eax(0x00000001);
239} 287}
240#else 288#else
@@ -242,36 +290,54 @@ void load_ucode_amd_ap(void)
242{ 290{
243 unsigned int cpu = smp_processor_id(); 291 unsigned int cpu = smp_processor_id();
244 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 292 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
293 struct equiv_cpu_entry *eq;
294 struct microcode_amd *mc;
245 u32 rev, eax; 295 u32 rev, eax;
296 u16 eq_id;
297
298 /* Exit if called on the BSP. */
299 if (!cpu)
300 return;
301
302 if (!container)
303 return;
246 304
247 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); 305 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
248 eax = cpuid_eax(0x00000001);
249 306
250 uci->cpu_sig.rev = rev; 307 uci->cpu_sig.rev = rev;
251 uci->cpu_sig.sig = eax; 308 uci->cpu_sig.sig = eax;
252 309
253 if (cpu && !ucode_loaded) { 310 eax = cpuid_eax(0x00000001);
254 void *ucode; 311 eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
255 312
256 if (!ucode_size || !initrd_start) 313 eq_id = find_equiv_id(eq, eax);
257 return; 314 if (!eq_id)
315 return;
316
317 if (eq_id == this_equiv_id) {
318 mc = (struct microcode_amd *)amd_ucode_patch;
258 319
259 ucode = (void *)(initrd_start + ucode_offset); 320 if (mc && rev < mc->hdr.patch_id) {
260 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 321 if (!__apply_microcode_amd(mc))
261 if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) 322 ucode_new_rev = mc->hdr.patch_id;
323 }
324
325 } else {
326 if (!ucode_cpio.data)
262 return; 327 return;
263 328
264 ucode_loaded = true; 329 /*
 330 * AP has a different equivalence ID than the BSP; this looks like
 331 * mixed-steppings silicon, so go through the ucode blob anew.
332 */
333 apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
265 } 334 }
266
267 apply_microcode_amd(cpu);
268} 335}
269#endif 336#endif
270 337
271int __init save_microcode_in_initrd_amd(void) 338int __init save_microcode_in_initrd_amd(void)
272{ 339{
273 enum ucode_state ret; 340 enum ucode_state ret;
274 void *ucode;
275 u32 eax; 341 u32 eax;
276 342
277#ifdef CONFIG_X86_32 343#ifdef CONFIG_X86_32
@@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void)
280 346
281 if (!uci->cpu_sig.sig) 347 if (!uci->cpu_sig.sig)
282 smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); 348 smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
349
350 /*
351 * Take into account the fact that the ramdisk might get relocated
352 * and therefore we need to recompute the container's position in
353 * virtual memory space.
354 */
355 container = (u8 *)(__va((u32)relocated_ramdisk) +
356 ((u32)container - boot_params.hdr.ramdisk_image));
283#endif 357#endif
284 if (ucode_new_rev) 358 if (ucode_new_rev)
285 pr_info("microcode: updated early to new patch_level=0x%08x\n", 359 pr_info("microcode: updated early to new patch_level=0x%08x\n",
286 ucode_new_rev); 360 ucode_new_rev);
287 361
288 if (ucode_loaded || !ucode_size || !initrd_start) 362 if (!container)
289 return 0; 363 return -EINVAL;
290 364
291 ucode = (void *)(initrd_start + ucode_offset);
292 eax = cpuid_eax(0x00000001); 365 eax = cpuid_eax(0x00000001);
293 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 366 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
294 367
295 ret = load_microcode_amd(eax, ucode, ucode_size); 368 ret = load_microcode_amd(eax, container, container_size);
296 if (ret != UCODE_OK) 369 if (ret != UCODE_OK)
297 return -EINVAL; 370 return -EINVAL;
298 371
299 ucode_loaded = true; 372 /*
 373 * This will be freed any msec now; stash patches for the current
 374 * family and switch to the patch cache for cpu hotplug etc. later.
375 */
376 container = NULL;
377 container_size = 0;
378
300 return 0; 379 return 0;
301} 380}
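
For orientation, a rough sketch of the container layout that compute_container_size() and apply_ucode_in_initrd() walk above. The struct names here are hypothetical; the real header sizes come from CONTAINER_HDR_SZ and SECTION_HDR_SIZE in the microcode headers, and the fields are inferred from the parsing code in this patch:

	/* Hypothetical layout sketch, derived from the parsing code above. */
	struct amd_container_hdr {		/* CONTAINER_HDR_SZ bytes */
		u32 magic;			/* UCODE_MAGIC */
		u32 type;			/* UCODE_EQUIV_CPU_TABLE_TYPE */
		u32 equiv_tbl_size;		/* header[2] in the code above */
		/* followed by equiv_tbl_size bytes of struct equiv_cpu_entry */
	};

	struct amd_section_hdr {		/* SECTION_HDR_SIZE bytes */
		u32 type;			/* UCODE_UCODE_TYPE */
		u32 patch_size;			/* header[1]; capped at PATCH_MAX_SIZE */
		/* followed by patch_size bytes of struct microcode_amd */
	};
	/* Sections repeat until the container ends; whole containers may be
	 * appended back to back in the initrd blob. */
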
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..15c987698b0f 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..be7f8514f577 100644
--- a/arch/x86/kernel/microcode_core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556b..a276fa75d9b5 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
278 sprintf(name, "intel-ucode/%02x-%02x-%02x", 278 sprintf(name, "intel-ucode/%02x-%02x-%02x",
279 c->x86, c->x86_model, c->x86_mask); 279 c->x86, c->x86_model, c->x86_mask);
280 280
281 if (request_firmware(&firmware, name, device)) { 281 if (request_firmware_direct(&firmware, name, device)) {
282 pr_debug("data file %s load failed\n", name); 282 pr_debug("data file %s load failed\n", name);
283 return UCODE_NFOUND; 283 return UCODE_NFOUND;
284 } 284 }
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 1575deb2e636..18f739129e72 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -365,16 +365,6 @@ out:
365 return state; 365 return state;
366} 366}
367 367
368#define native_rdmsr(msr, val1, val2) \
369do { \
370 u64 __val = native_read_msr((msr)); \
371 (void)((val1) = (u32)__val); \
372 (void)((val2) = (u32)(__val >> 32)); \
373} while (0)
374
375#define native_wrmsr(msr, low, high) \
376 native_write_msr(msr, low, high);
377
378static int collect_cpu_info_early(struct ucode_cpu_info *uci) 368static int collect_cpu_info_early(struct ucode_cpu_info *uci)
379{ 369{
380 unsigned int val[2]; 370 unsigned int val[2];
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..ce69320d0179 100644
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
1883 1883
1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1885{ 1885{
1886 struct cyc2ns_data *data;
1887
1886 userpg->cap_user_time = 0; 1888 userpg->cap_user_time = 0;
1887 userpg->cap_user_time_zero = 0; 1889 userpg->cap_user_time_zero = 0;
1888 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1890 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
1889 userpg->pmc_width = x86_pmu.cntval_bits; 1891 userpg->pmc_width = x86_pmu.cntval_bits;
1890 1892
1891 if (!sched_clock_stable) 1893 if (!sched_clock_stable())
1892 return; 1894 return;
1893 1895
1896 data = cyc2ns_read_begin();
1897
1894 userpg->cap_user_time = 1; 1898 userpg->cap_user_time = 1;
1895 userpg->time_mult = this_cpu_read(cyc2ns); 1899 userpg->time_mult = data->cyc2ns_mul;
1896 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1900 userpg->time_shift = data->cyc2ns_shift;
1897 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1901 userpg->time_offset = data->cyc2ns_offset - now;
1898 1902
1899 userpg->cap_user_time_zero = 1; 1903 userpg->cap_user_time_zero = 1;
1900 userpg->time_zero = this_cpu_read(cyc2ns_offset); 1904 userpg->time_zero = data->cyc2ns_offset;
1905
1906 cyc2ns_read_end(data);
1901} 1907}
1902 1908
1903/* 1909/*
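
For context, user space consumes the cap_user_time fields written above roughly as in the sketch below; this follows the documented perf mmap-page conversion rather than code from this patch, and the helper name is made up. With the patch, mult/shift/offset now come from the per-cpu cyc2ns data (cyc2ns_read_begin/end) instead of the old globals.

	/* Sketch: convert a cycle count to time using the mmap'ed page fields. */
	static inline u64 cycles_to_time(u64 cycles, u32 time_mult, u16 time_shift,
					 u64 time_offset)
	{
		u64 quot = cycles >> time_shift;
		u64 rem  = cycles & (((u64)1 << time_shift) - 1);

		/* split the multiply to avoid 64-bit overflow */
		return time_offset + quot * time_mult +
		       ((rem * time_mult) >> time_shift);
	}
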
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bfb7b8f..4b8e4d3cd6ea 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/syscore_ops.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15 16
@@ -816,6 +817,18 @@ out:
816 return ret; 817 return ret;
817} 818}
818 819
820static void ibs_eilvt_setup(void)
821{
822 /*
823 * Force LVT offset assignment for family 10h: The offsets are
824 * not assigned by the BIOS for this family, so the OS is
825 * responsible for doing it. If the OS assignment fails, fall
826 * back to BIOS settings and try to setup this.
827 */
828 if (boot_cpu_data.x86 == 0x10)
829 force_ibs_eilvt_setup();
830}
831
819static inline int get_ibs_lvt_offset(void) 832static inline int get_ibs_lvt_offset(void)
820{ 833{
821 u64 val; 834 u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
851 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); 864 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
852} 865}
853 866
867#ifdef CONFIG_PM
868
869static int perf_ibs_suspend(void)
870{
871 clear_APIC_ibs(NULL);
872 return 0;
873}
874
875static void perf_ibs_resume(void)
876{
877 ibs_eilvt_setup();
878 setup_APIC_ibs(NULL);
879}
880
881static struct syscore_ops perf_ibs_syscore_ops = {
882 .resume = perf_ibs_resume,
883 .suspend = perf_ibs_suspend,
884};
885
886static void perf_ibs_pm_init(void)
887{
888 register_syscore_ops(&perf_ibs_syscore_ops);
889}
890
891#else
892
893static inline void perf_ibs_pm_init(void) { }
894
895#endif
896
854static int 897static int
855perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) 898perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
856{ 899{
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
877 if (!caps) 920 if (!caps)
878 return -ENODEV; /* ibs not supported by the cpu */ 921 return -ENODEV; /* ibs not supported by the cpu */
879 922
880 /* 923 ibs_eilvt_setup();
881 * Force LVT offset assignment for family 10h: The offsets are
882 * not assigned by the BIOS for this family, so the OS is
883 * responsible for doing it. If the OS assignment fails, fall
884 * back to BIOS settings and try to setup this.
885 */
886 if (boot_cpu_data.x86 == 0x10)
887 force_ibs_eilvt_setup();
888 924
889 if (!ibs_eilvt_valid()) 925 if (!ibs_eilvt_valid())
890 goto out; 926 goto out;
891 927
928 perf_ibs_pm_init();
892 get_online_cpus(); 929 get_online_cpus();
893 ibs_caps = caps; 930 ibs_caps = caps;
894 /* make ibs_caps visible to other cpus: */ 931 /* make ibs_caps visible to other cpus: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..5ad35ad94d0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,679 @@
1/*
2 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
3 * Copyright (C) 2013 Google, Inc., Stephane Eranian
4 *
5 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
6 * section 14.7.1 (September 2013)
7 *
 8 * RAPL provides more controls than just reporting energy consumption;
 9 * however, here we only expose the 3 energy consumption free running
10 * counters (pp0, pkg, dram).
11 *
12 * Each of those counters increments in a power unit defined by the
13 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
14 * but it can vary.
15 *
16 * Counter to rapl events mappings:
17 *
18 * pp0 counter: consumption of all physical cores (power plane 0)
19 * event: rapl_energy_cores
20 * perf code: 0x1
21 *
22 * pkg counter: consumption of the whole processor package
23 * event: rapl_energy_pkg
24 * perf code: 0x2
25 *
26 * dram counter: consumption of the dram domain (servers only)
27 * event: rapl_energy_dram
28 * perf code: 0x3
29 *
 30 * gpu counter: consumption of the builtin-gpu domain (client only)
31 * event: rapl_energy_gpu
32 * perf code: 0x4
33 *
34 * We manage those counters as free running (read-only). They may be
 35 * used simultaneously by other tools, such as turbostat.
36 *
37 * The events only support system-wide mode counting. There is no
38 * sampling support because it does not make sense and is not
39 * supported by the RAPL hardware.
40 *
41 * Because we want to avoid floating-point operations in the kernel,
42 * the events are all reported in fixed point arithmetic (32.32).
43 * Tools must adjust the counts to convert them to Watts using
44 * the duration of the measurement. Tools may use a function such as
45 * ldexp(raw_count, -32);
46 */
47#include <linux/module.h>
48#include <linux/slab.h>
49#include <linux/perf_event.h>
50#include <asm/cpu_device_id.h>
51#include "perf_event.h"
52
53/*
54 * RAPL energy status counters
55 */
56#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
57#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
58#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
59#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
60#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
61#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
64
65/* Clients have PP0, PKG */
66#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
67 1<<RAPL_IDX_PKG_NRG_STAT|\
68 1<<RAPL_IDX_PP1_NRG_STAT)
69
70/* Servers have PP0, PKG, RAM */
71#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
72 1<<RAPL_IDX_PKG_NRG_STAT|\
73 1<<RAPL_IDX_RAM_NRG_STAT)
74
75/*
76 * event code: LSB 8 bits, passed in attr->config
77 * any other bit is reserved
78 */
79#define RAPL_EVENT_MASK 0xFFULL
80
81#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
82static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
83 struct kobj_attribute *attr, \
84 char *page) \
85{ \
86 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
87 return sprintf(page, _format "\n"); \
88} \
89static struct kobj_attribute format_attr_##_var = \
90 __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
91
92#define RAPL_EVENT_DESC(_name, _config) \
93{ \
94 .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
95 .config = _config, \
96}
97
98#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
99
100struct rapl_pmu {
101 spinlock_t lock;
102 int hw_unit; /* 1/2^hw_unit Joule */
103 int n_active; /* number of active events */
104 struct list_head active_list;
105 struct pmu *pmu; /* pointer to rapl_pmu_class */
106 ktime_t timer_interval; /* in ktime_t unit */
107 struct hrtimer hrtimer;
108};
109
110static struct pmu rapl_pmu_class;
111static cpumask_t rapl_cpu_mask;
112static int rapl_cntr_mask;
113
114static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
115static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
116
117static inline u64 rapl_read_counter(struct perf_event *event)
118{
119 u64 raw;
120 rdmsrl(event->hw.event_base, raw);
121 return raw;
122}
123
124static inline u64 rapl_scale(u64 v)
125{
126 /*
127 * scale delta to smallest unit (1/2^32)
 128 * users must then scale back: count * 2^-32 to get Joules
129 * or use ldexp(count, -32).
130 * Watts = Joules/Time delta
131 */
132 return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
133}
134
135static u64 rapl_event_update(struct perf_event *event)
136{
137 struct hw_perf_event *hwc = &event->hw;
138 u64 prev_raw_count, new_raw_count;
139 s64 delta, sdelta;
140 int shift = RAPL_CNTR_WIDTH;
141
142again:
143 prev_raw_count = local64_read(&hwc->prev_count);
144 rdmsrl(event->hw.event_base, new_raw_count);
145
146 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
147 new_raw_count) != prev_raw_count) {
148 cpu_relax();
149 goto again;
150 }
151
152 /*
153 * Now we have the new raw value and have updated the prev
154 * timestamp already. We can now calculate the elapsed delta
155 * (event-)time and add that to the generic event.
156 *
157 * Careful, not all hw sign-extends above the physical width
158 * of the count.
159 */
160 delta = (new_raw_count << shift) - (prev_raw_count << shift);
161 delta >>= shift;
162
163 sdelta = rapl_scale(delta);
164
165 local64_add(sdelta, &event->count);
166
167 return new_raw_count;
168}
169
170static void rapl_start_hrtimer(struct rapl_pmu *pmu)
171{
172 __hrtimer_start_range_ns(&pmu->hrtimer,
173 pmu->timer_interval, 0,
174 HRTIMER_MODE_REL_PINNED, 0);
175}
176
177static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
178{
179 hrtimer_cancel(&pmu->hrtimer);
180}
181
182static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
183{
184 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
185 struct perf_event *event;
186 unsigned long flags;
187
188 if (!pmu->n_active)
189 return HRTIMER_NORESTART;
190
191 spin_lock_irqsave(&pmu->lock, flags);
192
193 list_for_each_entry(event, &pmu->active_list, active_entry) {
194 rapl_event_update(event);
195 }
196
197 spin_unlock_irqrestore(&pmu->lock, flags);
198
199 hrtimer_forward_now(hrtimer, pmu->timer_interval);
200
201 return HRTIMER_RESTART;
202}
203
204static void rapl_hrtimer_init(struct rapl_pmu *pmu)
205{
206 struct hrtimer *hr = &pmu->hrtimer;
207
208 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
209 hr->function = rapl_hrtimer_handle;
210}
211
212static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
213 struct perf_event *event)
214{
215 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
216 return;
217
218 event->hw.state = 0;
219
220 list_add_tail(&event->active_entry, &pmu->active_list);
221
222 local64_set(&event->hw.prev_count, rapl_read_counter(event));
223
224 pmu->n_active++;
225 if (pmu->n_active == 1)
226 rapl_start_hrtimer(pmu);
227}
228
229static void rapl_pmu_event_start(struct perf_event *event, int mode)
230{
231 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
232 unsigned long flags;
233
234 spin_lock_irqsave(&pmu->lock, flags);
235 __rapl_pmu_event_start(pmu, event);
236 spin_unlock_irqrestore(&pmu->lock, flags);
237}
238
239static void rapl_pmu_event_stop(struct perf_event *event, int mode)
240{
241 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
242 struct hw_perf_event *hwc = &event->hw;
243 unsigned long flags;
244
245 spin_lock_irqsave(&pmu->lock, flags);
246
247 /* mark event as deactivated and stopped */
248 if (!(hwc->state & PERF_HES_STOPPED)) {
249 WARN_ON_ONCE(pmu->n_active <= 0);
250 pmu->n_active--;
251 if (pmu->n_active == 0)
252 rapl_stop_hrtimer(pmu);
253
254 list_del(&event->active_entry);
255
256 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
257 hwc->state |= PERF_HES_STOPPED;
258 }
259
260 /* check if update of sw counter is necessary */
261 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
262 /*
 263 * Drain the remaining delta count out of an event
264 * that we are disabling:
265 */
266 rapl_event_update(event);
267 hwc->state |= PERF_HES_UPTODATE;
268 }
269
270 spin_unlock_irqrestore(&pmu->lock, flags);
271}
272
273static int rapl_pmu_event_add(struct perf_event *event, int mode)
274{
275 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
276 struct hw_perf_event *hwc = &event->hw;
277 unsigned long flags;
278
279 spin_lock_irqsave(&pmu->lock, flags);
280
281 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
282
283 if (mode & PERF_EF_START)
284 __rapl_pmu_event_start(pmu, event);
285
286 spin_unlock_irqrestore(&pmu->lock, flags);
287
288 return 0;
289}
290
291static void rapl_pmu_event_del(struct perf_event *event, int flags)
292{
293 rapl_pmu_event_stop(event, PERF_EF_UPDATE);
294}
295
296static int rapl_pmu_event_init(struct perf_event *event)
297{
298 u64 cfg = event->attr.config & RAPL_EVENT_MASK;
299 int bit, msr, ret = 0;
300
301 /* only look at RAPL events */
302 if (event->attr.type != rapl_pmu_class.type)
303 return -ENOENT;
304
305 /* check only supported bits are set */
306 if (event->attr.config & ~RAPL_EVENT_MASK)
307 return -EINVAL;
308
309 /*
310 * check event is known (determines counter)
311 */
312 switch (cfg) {
313 case INTEL_RAPL_PP0:
314 bit = RAPL_IDX_PP0_NRG_STAT;
315 msr = MSR_PP0_ENERGY_STATUS;
316 break;
317 case INTEL_RAPL_PKG:
318 bit = RAPL_IDX_PKG_NRG_STAT;
319 msr = MSR_PKG_ENERGY_STATUS;
320 break;
321 case INTEL_RAPL_RAM:
322 bit = RAPL_IDX_RAM_NRG_STAT;
323 msr = MSR_DRAM_ENERGY_STATUS;
324 break;
325 case INTEL_RAPL_PP1:
326 bit = RAPL_IDX_PP1_NRG_STAT;
327 msr = MSR_PP1_ENERGY_STATUS;
328 break;
329 default:
330 return -EINVAL;
331 }
332 /* check event supported */
333 if (!(rapl_cntr_mask & (1 << bit)))
334 return -EINVAL;
335
336 /* unsupported modes and filters */
337 if (event->attr.exclude_user ||
338 event->attr.exclude_kernel ||
339 event->attr.exclude_hv ||
340 event->attr.exclude_idle ||
341 event->attr.exclude_host ||
342 event->attr.exclude_guest ||
343 event->attr.sample_period) /* no sampling */
344 return -EINVAL;
345
346 /* must be done before validate_group */
347 event->hw.event_base = msr;
348 event->hw.config = cfg;
349 event->hw.idx = bit;
350
351 return ret;
352}
353
354static void rapl_pmu_event_read(struct perf_event *event)
355{
356 rapl_event_update(event);
357}
358
359static ssize_t rapl_get_attr_cpumask(struct device *dev,
360 struct device_attribute *attr, char *buf)
361{
362 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
363
364 buf[n++] = '\n';
365 buf[n] = '\0';
366 return n;
367}
368
369static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
370
371static struct attribute *rapl_pmu_attrs[] = {
372 &dev_attr_cpumask.attr,
373 NULL,
374};
375
376static struct attribute_group rapl_pmu_attr_group = {
377 .attrs = rapl_pmu_attrs,
378};
379
380EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
381EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
382EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
383EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
384
385EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
386EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
387EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
388EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
389
390/*
391 * we compute in 0.23 nJ increments regardless of MSR
392 */
393EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
394EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
395EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
396EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
397
398static struct attribute *rapl_events_srv_attr[] = {
399 EVENT_PTR(rapl_cores),
400 EVENT_PTR(rapl_pkg),
401 EVENT_PTR(rapl_ram),
402
403 EVENT_PTR(rapl_cores_unit),
404 EVENT_PTR(rapl_pkg_unit),
405 EVENT_PTR(rapl_ram_unit),
406
407 EVENT_PTR(rapl_cores_scale),
408 EVENT_PTR(rapl_pkg_scale),
409 EVENT_PTR(rapl_ram_scale),
410 NULL,
411};
412
413static struct attribute *rapl_events_cln_attr[] = {
414 EVENT_PTR(rapl_cores),
415 EVENT_PTR(rapl_pkg),
416 EVENT_PTR(rapl_gpu),
417
418 EVENT_PTR(rapl_cores_unit),
419 EVENT_PTR(rapl_pkg_unit),
420 EVENT_PTR(rapl_gpu_unit),
421
422 EVENT_PTR(rapl_cores_scale),
423 EVENT_PTR(rapl_pkg_scale),
424 EVENT_PTR(rapl_gpu_scale),
425 NULL,
426};
427
428static struct attribute_group rapl_pmu_events_group = {
429 .name = "events",
430 .attrs = NULL, /* patched at runtime */
431};
432
433DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
434static struct attribute *rapl_formats_attr[] = {
435 &format_attr_event.attr,
436 NULL,
437};
438
439static struct attribute_group rapl_pmu_format_group = {
440 .name = "format",
441 .attrs = rapl_formats_attr,
442};
443
444const struct attribute_group *rapl_attr_groups[] = {
445 &rapl_pmu_attr_group,
446 &rapl_pmu_format_group,
447 &rapl_pmu_events_group,
448 NULL,
449};
450
451static struct pmu rapl_pmu_class = {
452 .attr_groups = rapl_attr_groups,
453 .task_ctx_nr = perf_invalid_context, /* system-wide only */
454 .event_init = rapl_pmu_event_init,
455 .add = rapl_pmu_event_add, /* must have */
456 .del = rapl_pmu_event_del, /* must have */
457 .start = rapl_pmu_event_start,
458 .stop = rapl_pmu_event_stop,
459 .read = rapl_pmu_event_read,
460};
461
462static void rapl_cpu_exit(int cpu)
463{
464 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
465 int i, phys_id = topology_physical_package_id(cpu);
466 int target = -1;
467
468 /* find a new cpu on same package */
469 for_each_online_cpu(i) {
470 if (i == cpu)
471 continue;
472 if (phys_id == topology_physical_package_id(i)) {
473 target = i;
474 break;
475 }
476 }
477 /*
478 * clear cpu from cpumask
 479 * if it was set in the cpumask and some cpu on the package remains,
480 * then move to new cpu
481 */
482 if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
483 cpumask_set_cpu(target, &rapl_cpu_mask);
484
485 WARN_ON(cpumask_empty(&rapl_cpu_mask));
486 /*
487 * migrate events and context to new cpu
488 */
489 if (target >= 0)
490 perf_pmu_migrate_context(pmu->pmu, cpu, target);
491
492 /* cancel overflow polling timer for CPU */
493 rapl_stop_hrtimer(pmu);
494}
495
496static void rapl_cpu_init(int cpu)
497{
498 int i, phys_id = topology_physical_package_id(cpu);
499
 500 /* check if phys_id is already covered */
501 for_each_cpu(i, &rapl_cpu_mask) {
502 if (phys_id == topology_physical_package_id(i))
503 return;
504 }
505 /* was not found, so add it */
506 cpumask_set_cpu(cpu, &rapl_cpu_mask);
507}
508
509static int rapl_cpu_prepare(int cpu)
510{
511 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
512 int phys_id = topology_physical_package_id(cpu);
513 u64 ms;
514
515 if (pmu)
516 return 0;
517
518 if (phys_id < 0)
519 return -1;
520
521 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
522 if (!pmu)
523 return -1;
524
525 spin_lock_init(&pmu->lock);
526
527 INIT_LIST_HEAD(&pmu->active_list);
528
529 /*
530 * grab power unit as: 1/2^unit Joules
531 *
532 * we cache in local PMU instance
533 */
534 rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
535 pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
536 pmu->pmu = &rapl_pmu_class;
537
538 /*
539 * use reference of 200W for scaling the timeout
540 * to avoid missing counter overflows.
541 * 200W = 200 Joules/sec
542 * divide interval by 2 to avoid lockstep (2 * 100)
543 * if hw unit is 32, then we use 2 ms 1/200/2
544 */
545 if (pmu->hw_unit < 32)
546 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
547 else
548 ms = 2;
549
550 pmu->timer_interval = ms_to_ktime(ms);
551
552 rapl_hrtimer_init(pmu);
553
554 /* set RAPL pmu for this cpu for now */
555 per_cpu(rapl_pmu, cpu) = pmu;
556 per_cpu(rapl_pmu_to_free, cpu) = NULL;
557
558 return 0;
559}
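
A small standalone sketch of the timeout arithmetic described in the comment above; the 200 W reference and the divide-by-two margin are taken from that comment, while the helper name and the hw_unit value of 16 are purely illustrative:

static u64 rapl_ovfl_interval_ms(u64 hw_unit)	/* illustration-only helper */
{
	/*
	 * A 32-bit energy counter counting 1/2^hw_unit Joule units wraps
	 * after 2^(32 - hw_unit) Joules; at the 200 W reference that takes
	 * 2^(32 - hw_unit) / 200 seconds, halved to stay well clear of the
	 * overflow ("divide interval by 2 to avoid lockstep").
	 */
	if (hw_unit >= 32)
		return 2;
	return (1000 / (2 * 100)) * (1ULL << (32 - hw_unit - 1));
}

/* e.g. hw_unit == 16 (15.3 uJ units): 5 * 2^15 = 163840 ms, roughly 164 s */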
560
561static void rapl_cpu_kfree(int cpu)
562{
563 struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
564
565 kfree(pmu);
566
567 per_cpu(rapl_pmu_to_free, cpu) = NULL;
568}
569
570static int rapl_cpu_dying(int cpu)
571{
572 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
573
574 if (!pmu)
575 return 0;
576
577 per_cpu(rapl_pmu, cpu) = NULL;
578
579 per_cpu(rapl_pmu_to_free, cpu) = pmu;
580
581 return 0;
582}
583
584static int rapl_cpu_notifier(struct notifier_block *self,
585 unsigned long action, void *hcpu)
586{
587 unsigned int cpu = (long)hcpu;
588
589 switch (action & ~CPU_TASKS_FROZEN) {
590 case CPU_UP_PREPARE:
591 rapl_cpu_prepare(cpu);
592 break;
593 case CPU_STARTING:
594 rapl_cpu_init(cpu);
595 break;
596 case CPU_UP_CANCELED:
597 case CPU_DYING:
598 rapl_cpu_dying(cpu);
599 break;
600 case CPU_ONLINE:
601 case CPU_DEAD:
602 rapl_cpu_kfree(cpu);
603 break;
604 case CPU_DOWN_PREPARE:
605 rapl_cpu_exit(cpu);
606 break;
607 default:
608 break;
609 }
610
611 return NOTIFY_OK;
612}
613
614static const struct x86_cpu_id rapl_cpu_match[] = {
615 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
616 [1] = {},
617};
618
619static int __init rapl_pmu_init(void)
620{
621 struct rapl_pmu *pmu;
622 int cpu, ret;
623
624 /*
625 * check for Intel processor family 6
626 */
627 if (!x86_match_cpu(rapl_cpu_match))
628 return 0;
629
630 /* check supported CPU */
631 switch (boot_cpu_data.x86_model) {
632 case 42: /* Sandy Bridge */
633 case 58: /* Ivy Bridge */
634 case 60: /* Haswell */
635 case 69: /* Haswell-Celeron */
636 rapl_cntr_mask = RAPL_IDX_CLN;
637 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
638 break;
639 case 45: /* Sandy Bridge-EP */
640 case 62: /* IvyTown */
641 rapl_cntr_mask = RAPL_IDX_SRV;
642 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
643 break;
644
645 default:
646 /* unsupported */
647 return 0;
648 }
649 get_online_cpus();
650
651 for_each_online_cpu(cpu) {
652 rapl_cpu_prepare(cpu);
653 rapl_cpu_init(cpu);
654 }
655
656 perf_cpu_notifier(rapl_cpu_notifier);
657
658 ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
659 if (WARN_ON(ret)) {
660 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
661 put_online_cpus();
662 return -1;
663 }
664
665 pmu = __get_cpu_var(rapl_pmu);
666
667 pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
668 " API unit is 2^-32 Joules,"
669 " %d fixed counters"
670 " %llu ms ovfl timer\n",
671 pmu->hw_unit,
672 hweight32(rapl_cntr_mask),
673 ktime_to_ms(pmu->timer_interval));
674
675 put_online_cpus();
676
677 return 0;
678}
679device_initcall(rapl_pmu_init);
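
Once the PMU is registered as "power", the events and the 2^-32 Joule scale strings above are visible through the normal perf syscall interface. A minimal user-space sketch follows; it assumes the usual sysfs layout for dynamic PMUs, hard-codes config 0x2 for energy-pkg (check /sys/bus/event_source/devices/power/events/energy-pkg if in doubt), counts on cpu 0 for simplicity, and needs sufficient privileges since the PMU is system-wide only:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	unsigned int type;
	FILE *f;
	int fd;

	/* dynamic PMU type id of the "power" PMU registered above */
	f = fopen("/sys/bus/event_source/devices/power/type", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x2;	/* assumed encoding of energy-pkg */

	/* system-wide only: pid == -1, one cpu (cpu 0 here) */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) != sizeof(count))
		return 1;

	/* energy-pkg.scale above is 2.3283064365386962890625e-10 = 2^-32 J */
	printf("package energy over ~1s: %.3f J\n",
	       count * 2.3283064365386962890625e-10);
	return 0;
}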
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010845cb..384df5105fbc 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
31} 31}
32__setup("nordrand", x86_rdrand_setup); 32__setup("nordrand", x86_rdrand_setup);
33 33
34/* We can't use arch_get_random_long() here since alternatives haven't run */
35static inline int rdrand_long(unsigned long *v)
36{
37 int ok;
38 asm volatile("1: " RDRAND_LONG "\n\t"
39 "jc 2f\n\t"
40 "decl %0\n\t"
41 "jnz 1b\n\t"
42 "2:"
43 : "=r" (ok), "=a" (*v)
44 : "0" (RDRAND_RETRY_LOOPS));
45 return ok;
46}
47
48/* 34/*
49 * Force a reseed cycle; we are architecturally guaranteed a reseed 35 * Force a reseed cycle; we are architecturally guaranteed a reseed
50 * after no more than 512 128-bit chunks of random data. This also 36 * after no more than 512 128-bit chunks of random data. This also
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d69b90..3fa0e5ad86b4 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h> 3#include <asm/processor.h>
5#include <asm/msr.h> 4#include <asm/msr.h>
6#include "cpu.h" 5#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 75c5ad5d35cc..ef9c2a0078bd 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h> 2#include <asm/processor.h>
4#include "cpu.h" 3#include "cpu.h"
5 4
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 18677a90d6a3..a57902efe2d5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
7 * 7 *
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/types.h> 10#include <linux/types.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/smp.h> 12#include <linux/smp.h>
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d36e4a..f6dfd9334b67 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h> 3#include <linux/init_task.h>
5#include <linux/fs.h> 4#include <linux/fs.h>
6 5
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
1120 nr_pages += end_pfn - start_pfn; 1120 nr_pages += end_pfn - start_pfn;
1121 } 1121 }
1122 1122
1123 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { 1123 for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); 1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); 1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1126 if (start_pfn < end_pfn) 1126 if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 51e2988c5728..a2a4f4697889 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller)
1082 pushl $0 /* Pass NULL as regs pointer */ 1082 pushl $0 /* Pass NULL as regs pointer */
1083 movl 4*4(%esp), %eax 1083 movl 4*4(%esp), %eax
1084 movl 0x4(%ebp), %edx 1084 movl 0x4(%ebp), %edx
1085 leal function_trace_op, %ecx 1085 movl function_trace_op, %ecx
1086 subl $MCOUNT_INSN_SIZE, %eax 1086 subl $MCOUNT_INSN_SIZE, %eax
1087 1087
1088.globl ftrace_call 1088.globl ftrace_call
@@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller)
1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */ 1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ 1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ 1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1143 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ 1143 movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1144 pushl %esp /* Save pt_regs as 4th parameter */ 1144 pushl %esp /* Save pt_regs as 4th parameter */
1145 1145
1146GLOBAL(ftrace_regs_call) 1146GLOBAL(ftrace_regs_call)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e21b0785a85b..1e96c3628bf2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -88,7 +88,7 @@ END(function_hook)
88 MCOUNT_SAVE_FRAME \skip 88 MCOUNT_SAVE_FRAME \skip
89 89
90 /* Load the ftrace_ops into the 3rd parameter */ 90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx 91 movq function_trace_op(%rip), %rdx
92 92
93 /* Load ip into the first parameter */ 93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi 94 movq RIP(%rsp), %rdi
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff162dce8..a67b47c31314 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -38,7 +38,6 @@
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/module.h> 39#include <linux/module.h>
40#include <linux/sched.h> 40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h> 41#include <linux/smp.h>
43 42
44#include <asm/hw_breakpoint.h> 43#include <asm/hw_breakpoint.h>
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 000000000000..c3aae6672843
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,226 @@
1/*
2 * IOSF-SB MailBox Interface Driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 *
15 * The IOSF-SB is a fabric bus available on Atom-based SoCs that uses a
16 * mailbox interface (MBI) to communicate with multiple devices. This
17 * driver implements access to this interface for those platforms that can
18 * enumerate the device using PCI.
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25
26#include <asm/iosf_mbi.h>
27
28static DEFINE_SPINLOCK(iosf_mbi_lock);
29
30static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
31{
32 return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
33}
34
35static struct pci_dev *mbi_pdev; /* one mbi device */
36
37static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
38{
39 int result;
40
41 if (!mbi_pdev)
42 return -ENODEV;
43
44 if (mcrx) {
45 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
46 mcrx);
47 if (result < 0)
48 goto fail_read;
49 }
50
51 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
52 if (result < 0)
53 goto fail_read;
54
55 result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
56 if (result < 0)
57 goto fail_read;
58
59 return 0;
60
61fail_read:
62 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
63 return result;
64}
65
66static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
67{
68 int result;
69
70 if (!mbi_pdev)
71 return -ENODEV;
72
73 result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
74 if (result < 0)
75 goto fail_write;
76
77 if (mcrx) {
78 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
79 mcrx);
80 if (result < 0)
81 goto fail_write;
82 }
83
84 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
85 if (result < 0)
86 goto fail_write;
87
88 return 0;
89
90fail_write:
91 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
92 return result;
93}
94
95int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
96{
97 u32 mcr, mcrx;
98 unsigned long flags;
99 int ret;
100
101 /*Access to the GFX unit is handled by GPU code */
102 if (port == BT_MBI_UNIT_GFX) {
103 WARN_ON(1);
104 return -EPERM;
105 }
106
107 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
108 mcrx = offset & MBI_MASK_HI;
109
110 spin_lock_irqsave(&iosf_mbi_lock, flags);
111 ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
112 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
113
114 return ret;
115}
116EXPORT_SYMBOL(iosf_mbi_read);
117
118int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
119{
120 u32 mcr, mcrx;
121 unsigned long flags;
122 int ret;
123
124 /*Access to the GFX unit is handled by GPU code */
125 if (port == BT_MBI_UNIT_GFX) {
126 WARN_ON(1);
127 return -EPERM;
128 }
129
130 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
131 mcrx = offset & MBI_MASK_HI;
132
133 spin_lock_irqsave(&iosf_mbi_lock, flags);
134 ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
135 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
136
137 return ret;
138}
139EXPORT_SYMBOL(iosf_mbi_write);
140
141int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
142{
143 u32 mcr, mcrx;
144 u32 value;
145 unsigned long flags;
146 int ret;
147
148 /*Access to the GFX unit is handled by GPU code */
149 if (port == BT_MBI_UNIT_GFX) {
150 WARN_ON(1);
151 return -EPERM;
152 }
153
154 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
155 mcrx = offset & MBI_MASK_HI;
156
157 spin_lock_irqsave(&iosf_mbi_lock, flags);
158
159 /* Read current mdr value */
160 ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
161 if (ret < 0) {
162 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
163 return ret;
164 }
165
166 /* Apply mask */
167 value &= ~mask;
168 mdr &= mask;
169 value |= mdr;
170
171 /* Write back */
172 ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
173
174 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
175
176 return ret;
177}
178EXPORT_SYMBOL(iosf_mbi_modify);
179
180static int iosf_mbi_probe(struct pci_dev *pdev,
181 const struct pci_device_id *unused)
182{
183 int ret;
184
185 ret = pci_enable_device(pdev);
186 if (ret < 0) {
187 dev_err(&pdev->dev, "error: could not enable device\n");
188 return ret;
189 }
190
191 mbi_pdev = pci_dev_get(pdev);
192 return 0;
193}
194
195static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
196 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
197 { 0, },
198};
199MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
200
201static struct pci_driver iosf_mbi_pci_driver = {
202 .name = "iosf_mbi_pci",
203 .probe = iosf_mbi_probe,
204 .id_table = iosf_mbi_pci_ids,
205};
206
207static int __init iosf_mbi_init(void)
208{
209 return pci_register_driver(&iosf_mbi_pci_driver);
210}
211
212static void __exit iosf_mbi_exit(void)
213{
214 pci_unregister_driver(&iosf_mbi_pci_driver);
215 if (mbi_pdev) {
216 pci_dev_put(mbi_pdev);
217 mbi_pdev = NULL;
218 }
219}
220
221module_init(iosf_mbi_init);
222module_exit(iosf_mbi_exit);
223
224MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
225MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
226MODULE_LICENSE("GPL v2");
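
A hypothetical in-kernel consumer of the exported API above could look like the sketch below; the function name and the port/opcode/offset numbers are placeholders for illustration, and real callers would use the unit and opcode definitions from asm/iosf_mbi.h instead:

#include <linux/kernel.h>
#include <linux/bitops.h>
#include <asm/iosf_mbi.h>

static int example_read_modify(void)	/* illustration-only */
{
	u32 val;
	int ret;

	/* placeholder port 0x04, read opcode 0x10, register offset 0x30 */
	ret = iosf_mbi_read(0x04, 0x10, 0x30, &val);
	if (ret)
		return ret;

	pr_info("IOSF reg 0x30 = 0x%08x\n", val);

	/* set bit 0 with the locked read-modify-write helper */
	return iosf_mbi_modify(0x04, 0x11 /* placeholder write opcode */,
			       0x30, BIT(0), BIT(0));
}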
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fda..dbb60878b744 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
193 if (!handle_irq(irq, regs)) { 193 if (!handle_irq(irq, regs)) {
194 ack_APIC_irq(); 194 ack_APIC_irq();
195 195
196 if (printk_ratelimit()) 196 if (irq != VECTOR_RETRIGGERED) {
197 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", 197 pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
198 __func__, smp_processor_id(), vector, irq); 198 __func__, smp_processor_id(),
199 vector, irq);
200 } else {
201 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
202 }
199 } 203 }
200 204
201 irq_exit(); 205 irq_exit();
@@ -262,6 +266,76 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
262EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 266EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
263 267
264#ifdef CONFIG_HOTPLUG_CPU 268#ifdef CONFIG_HOTPLUG_CPU
269/*
270 * This cpu is going to be removed and its vectors migrated to the remaining
271 * online cpus. Check to see if there are enough vectors in the remaining cpus.
272 * This function is protected by stop_machine().
273 */
274int check_irq_vectors_for_cpu_disable(void)
275{
276 int irq, cpu;
277 unsigned int this_cpu, vector, this_count, count;
278 struct irq_desc *desc;
279 struct irq_data *data;
280 struct cpumask affinity_new, online_new;
281
282 this_cpu = smp_processor_id();
283 cpumask_copy(&online_new, cpu_online_mask);
284 cpu_clear(this_cpu, online_new);
285
286 this_count = 0;
287 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
288 irq = __this_cpu_read(vector_irq[vector]);
289 if (irq >= 0) {
290 desc = irq_to_desc(irq);
291 data = irq_desc_get_irq_data(desc);
292 cpumask_copy(&affinity_new, data->affinity);
293 cpu_clear(this_cpu, affinity_new);
294
295 /* Do not count inactive or per-cpu irqs. */
296 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
297 continue;
298
299 /*
300 * A single irq may be mapped to multiple
301 * cpu's vector_irq[] (for example IOAPIC cluster
302 * mode). In this case we have two
303 * possibilities:
304 *
305			 * 1) the resulting affinity mask is empty; that is,
306			 * the down'd cpu is the last cpu in the irq's
307 * affinity mask, or
308 *
309 * 2) the resulting affinity mask is no longer
310 * a subset of the online cpus but the affinity
311			 * mask is not zero; that is, the down'd cpu is the
312			 * last online cpu in a user-set affinity mask.
313 */
314 if (cpumask_empty(&affinity_new) ||
315 !cpumask_subset(&affinity_new, &online_new))
316 this_count++;
317 }
318 }
319
320 count = 0;
321 for_each_online_cpu(cpu) {
322 if (cpu == this_cpu)
323 continue;
324 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
325 vector++) {
326 if (per_cpu(vector_irq, cpu)[vector] < 0)
327 count++;
328 }
329 }
330
331 if (count < this_count) {
332 pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
333 this_cpu, this_count, count);
334 return -ERANGE;
335 }
336 return 0;
337}
338
265/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 339/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
266void fixup_irqs(void) 340void fixup_irqs(void)
267{ 341{
@@ -344,7 +418,7 @@ void fixup_irqs(void)
344 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 418 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
345 unsigned int irr; 419 unsigned int irr;
346 420
347 if (__this_cpu_read(vector_irq[vector]) < 0) 421 if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
348 continue; 422 continue;
349 423
350 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 424 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +429,14 @@ void fixup_irqs(void)
355 data = irq_desc_get_irq_data(desc); 429 data = irq_desc_get_irq_data(desc);
356 chip = irq_data_get_irq_chip(data); 430 chip = irq_data_get_irq_chip(data);
357 raw_spin_lock(&desc->lock); 431 raw_spin_lock(&desc->lock);
358 if (chip->irq_retrigger) 432 if (chip->irq_retrigger) {
359 chip->irq_retrigger(data); 433 chip->irq_retrigger(data);
434 __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
435 }
360 raw_spin_unlock(&desc->lock); 436 raw_spin_unlock(&desc->lock);
361 } 437 }
362 __this_cpu_write(vector_irq[vector], -1); 438 if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
439 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
363 } 440 }
364} 441}
365#endif 442#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594ff..7f50156542fb 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
52}; 52};
53 53
54DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 54DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
55 [0 ... NR_VECTORS - 1] = -1, 55 [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
56}; 56};
57 57
58int vector_used_by_percpu_irq(unsigned int vector) 58int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
60 int cpu; 60 int cpu;
61 61
62 for_each_online_cpu(cpu) { 62 for_each_online_cpu(cpu) {
63 if (per_cpu(vector_irq, cpu)[vector] != -1) 63 if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
64 return 1; 64 return 1;
65 } 65 }
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960e..7ec1d5f8d283 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
41#include <linux/kgdb.h> 41#include <linux/kgdb.h>
42#include <linux/init.h>
43#include <linux/smp.h> 42#include <linux/smp.h>
44#include <linux/nmi.h> 43#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 44#include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..c2bedaea11f7
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
1/*
2 * Architecture specific sysfs attributes in /sys/kernel
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
6 * Copyright (C) 2013, 2013 Red Hat, Inc.
7 * Dave Young <dyoung@redhat.com>
8 *
9 * This file is released under the GPLv2
10 */
11
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/sysfs.h>
15#include <linux/init.h>
16#include <linux/stat.h>
17#include <linux/slab.h>
18#include <linux/mm.h>
19
20#include <asm/io.h>
21#include <asm/setup.h>
22
23static ssize_t version_show(struct kobject *kobj,
24 struct kobj_attribute *attr, char *buf)
25{
26 return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
27}
28
29static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
30
31static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
32 struct bin_attribute *bin_attr,
33 char *buf, loff_t off, size_t count)
34{
35 memcpy(buf, (void *)&boot_params + off, count);
36 return count;
37}
38
39static struct bin_attribute boot_params_data_attr = {
40 .attr = {
41 .name = "data",
42 .mode = S_IRUGO,
43 },
44 .read = boot_params_data_read,
45 .size = sizeof(boot_params),
46};
47
48static struct attribute *boot_params_version_attrs[] = {
49 &boot_params_version_attr.attr,
50 NULL,
51};
52
53static struct bin_attribute *boot_params_data_attrs[] = {
54 &boot_params_data_attr,
55 NULL,
56};
57
58static struct attribute_group boot_params_attr_group = {
59 .attrs = boot_params_version_attrs,
60 .bin_attrs = boot_params_data_attrs,
61};
62
63static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
64{
65 const char *name;
66
67 name = kobject_name(kobj);
68 return kstrtoint(name, 10, nr);
69}
70
71static int get_setup_data_paddr(int nr, u64 *paddr)
72{
73 int i = 0;
74 struct setup_data *data;
75 u64 pa_data = boot_params.hdr.setup_data;
76
77 while (pa_data) {
78 if (nr == i) {
79 *paddr = pa_data;
80 return 0;
81 }
82 data = ioremap_cache(pa_data, sizeof(*data));
83 if (!data)
84 return -ENOMEM;
85
86 pa_data = data->next;
87 iounmap(data);
88 i++;
89 }
90 return -EINVAL;
91}
92
93static int __init get_setup_data_size(int nr, size_t *size)
94{
95 int i = 0;
96 struct setup_data *data;
97 u64 pa_data = boot_params.hdr.setup_data;
98
99 while (pa_data) {
100 data = ioremap_cache(pa_data, sizeof(*data));
101 if (!data)
102 return -ENOMEM;
103 if (nr == i) {
104 *size = data->len;
105 iounmap(data);
106 return 0;
107 }
108
109 pa_data = data->next;
110 iounmap(data);
111 i++;
112 }
113 return -EINVAL;
114}
115
116static ssize_t type_show(struct kobject *kobj,
117 struct kobj_attribute *attr, char *buf)
118{
119 int nr, ret;
120 u64 paddr;
121 struct setup_data *data;
122
123 ret = kobj_to_setup_data_nr(kobj, &nr);
124 if (ret)
125 return ret;
126
127 ret = get_setup_data_paddr(nr, &paddr);
128 if (ret)
129 return ret;
130 data = ioremap_cache(paddr, sizeof(*data));
131 if (!data)
132 return -ENOMEM;
133
134 ret = sprintf(buf, "0x%x\n", data->type);
135 iounmap(data);
136 return ret;
137}
138
139static ssize_t setup_data_data_read(struct file *fp,
140 struct kobject *kobj,
141 struct bin_attribute *bin_attr,
142 char *buf,
143 loff_t off, size_t count)
144{
145 int nr, ret = 0;
146 u64 paddr;
147 struct setup_data *data;
148 void *p;
149
150 ret = kobj_to_setup_data_nr(kobj, &nr);
151 if (ret)
152 return ret;
153
154 ret = get_setup_data_paddr(nr, &paddr);
155 if (ret)
156 return ret;
157 data = ioremap_cache(paddr, sizeof(*data));
158 if (!data)
159 return -ENOMEM;
160
161 if (off > data->len) {
162 ret = -EINVAL;
163 goto out;
164 }
165
166 if (count > data->len - off)
167 count = data->len - off;
168
169 if (!count)
170 goto out;
171
172 ret = count;
173 p = ioremap_cache(paddr + sizeof(*data), data->len);
174 if (!p) {
175 ret = -ENOMEM;
176 goto out;
177 }
178 memcpy(buf, p + off, count);
179 iounmap(p);
180out:
181 iounmap(data);
182 return ret;
183}
184
185static struct kobj_attribute type_attr = __ATTR_RO(type);
186
187static struct bin_attribute data_attr = {
188 .attr = {
189 .name = "data",
190 .mode = S_IRUGO,
191 },
192 .read = setup_data_data_read,
193};
194
195static struct attribute *setup_data_type_attrs[] = {
196 &type_attr.attr,
197 NULL,
198};
199
200static struct bin_attribute *setup_data_data_attrs[] = {
201 &data_attr,
202 NULL,
203};
204
205static struct attribute_group setup_data_attr_group = {
206 .attrs = setup_data_type_attrs,
207 .bin_attrs = setup_data_data_attrs,
208};
209
210static int __init create_setup_data_node(struct kobject *parent,
211 struct kobject **kobjp, int nr)
212{
213 int ret = 0;
214 size_t size;
215 struct kobject *kobj;
216	char name[16]; /* should be enough for setup_data node numbers */
217 snprintf(name, 16, "%d", nr);
218
219 kobj = kobject_create_and_add(name, parent);
220 if (!kobj)
221 return -ENOMEM;
222
223 ret = get_setup_data_size(nr, &size);
224 if (ret)
225 goto out_kobj;
226
227 data_attr.size = size;
228 ret = sysfs_create_group(kobj, &setup_data_attr_group);
229 if (ret)
230 goto out_kobj;
231 *kobjp = kobj;
232
233 return 0;
234out_kobj:
235 kobject_put(kobj);
236 return ret;
237}
238
239static void __init cleanup_setup_data_node(struct kobject *kobj)
240{
241 sysfs_remove_group(kobj, &setup_data_attr_group);
242 kobject_put(kobj);
243}
244
245static int __init get_setup_data_total_num(u64 pa_data, int *nr)
246{
247 int ret = 0;
248 struct setup_data *data;
249
250 *nr = 0;
251 while (pa_data) {
252 *nr += 1;
253 data = ioremap_cache(pa_data, sizeof(*data));
254 if (!data) {
255 ret = -ENOMEM;
256 goto out;
257 }
258 pa_data = data->next;
259 iounmap(data);
260 }
261
262out:
263 return ret;
264}
265
266static int __init create_setup_data_nodes(struct kobject *parent)
267{
268 struct kobject *setup_data_kobj, **kobjp;
269 u64 pa_data;
270 int i, j, nr, ret = 0;
271
272 pa_data = boot_params.hdr.setup_data;
273 if (!pa_data)
274 return 0;
275
276 setup_data_kobj = kobject_create_and_add("setup_data", parent);
277 if (!setup_data_kobj) {
278 ret = -ENOMEM;
279 goto out;
280 }
281
282 ret = get_setup_data_total_num(pa_data, &nr);
283 if (ret)
284 goto out_setup_data_kobj;
285
286 kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
287 if (!kobjp) {
288 ret = -ENOMEM;
289 goto out_setup_data_kobj;
290 }
291
292 for (i = 0; i < nr; i++) {
293 ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
294 if (ret)
295 goto out_clean_nodes;
296 }
297
298 kfree(kobjp);
299 return 0;
300
301out_clean_nodes:
302 for (j = i - 1; j > 0; j--)
303 cleanup_setup_data_node(*(kobjp + j));
304 kfree(kobjp);
305out_setup_data_kobj:
306 kobject_put(setup_data_kobj);
307out:
308 return ret;
309}
310
311static int __init boot_params_ksysfs_init(void)
312{
313 int ret;
314 struct kobject *boot_params_kobj;
315
316 boot_params_kobj = kobject_create_and_add("boot_params",
317 kernel_kobj);
318 if (!boot_params_kobj) {
319 ret = -ENOMEM;
320 goto out;
321 }
322
323 ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
324 if (ret)
325 goto out_boot_params_kobj;
326
327 ret = create_setup_data_nodes(boot_params_kobj);
328 if (ret)
329 goto out_create_group;
330
331 return 0;
332out_create_group:
333 sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
334out_boot_params_kobj:
335 kobject_put(boot_params_kobj);
336out:
337 return ret;
338}
339
340arch_initcall(boot_params_ksysfs_init);
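
After this initcall runs, the attributes land under /sys/kernel/boot_params (with setup_data/<N>/type and setup_data/<N>/data subnodes). A minimal user-space sketch that reads the boot protocol version exposed by version_show() above; the "0x020c" value in the comment is only an example:

#include <stdio.h>

int main(void)
{
	char buf[16] = "";
	FILE *f = fopen("/sys/kernel/boot_params/version", "r");

	if (!f)
		return 1;
	if (!fgets(buf, sizeof(buf), f))	/* e.g. "0x020c\n" */
		buf[0] = '\0';
	fclose(f);
	printf("boot protocol version: %s", buf);
	return 0;
}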
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b00..1667b1de8d5d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h>
13#include <linux/numa.h> 12#include <linux/numa.h>
14#include <linux/ftrace.h> 13#include <linux/ftrace.h>
15#include <linux/suspend.h> 14#include <linux/suspend.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7d..da15918d1c81 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/gfp.h> 6#include <linux/gfp.h>
8#include <linux/pci.h> 7#include <linux/pci.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6f1236c29c4b..0de43e98ce08 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/reboot.h> 26#include <linux/reboot.h>
27#include <linux/init.h>
28#include <linux/mc146818rtc.h> 27#include <linux/mc146818rtc.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc9dee3..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
295 _brk_start = 0; 295 _brk_start = 0;
296} 296}
297 297
298u64 relocated_ramdisk;
299
298#ifdef CONFIG_BLK_DEV_INITRD 300#ifdef CONFIG_BLK_DEV_INITRD
299 301
300static u64 __init get_ramdisk_image(void) 302static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
321 u64 ramdisk_image = get_ramdisk_image(); 323 u64 ramdisk_image = get_ramdisk_image();
322 u64 ramdisk_size = get_ramdisk_size(); 324 u64 ramdisk_size = get_ramdisk_size();
323 u64 area_size = PAGE_ALIGN(ramdisk_size); 325 u64 area_size = PAGE_ALIGN(ramdisk_size);
324 u64 ramdisk_here;
325 unsigned long slop, clen, mapaddr; 326 unsigned long slop, clen, mapaddr;
326 char *p, *q; 327 char *p, *q;
327 328
328 /* We need to move the initrd down into directly mapped mem */ 329 /* We need to move the initrd down into directly mapped mem */
329 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 330 relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
330 area_size, PAGE_SIZE); 331 area_size, PAGE_SIZE);
331 332
332 if (!ramdisk_here) 333 if (!relocated_ramdisk)
333 panic("Cannot find place for new RAMDISK of size %lld\n", 334 panic("Cannot find place for new RAMDISK of size %lld\n",
334 ramdisk_size); 335 ramdisk_size);
335 336
336 /* Note: this includes all the mem currently occupied by 337 /* Note: this includes all the mem currently occupied by
337 the initrd, we rely on that fact to keep the data intact. */ 338 the initrd, we rely on that fact to keep the data intact. */
338 memblock_reserve(ramdisk_here, area_size); 339 memblock_reserve(relocated_ramdisk, area_size);
339 initrd_start = ramdisk_here + PAGE_OFFSET; 340 initrd_start = relocated_ramdisk + PAGE_OFFSET;
340 initrd_end = initrd_start + ramdisk_size; 341 initrd_end = initrd_start + ramdisk_size;
341 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", 342 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
342 ramdisk_here, ramdisk_here + ramdisk_size - 1); 343 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
343 344
344 q = (char *)initrd_start; 345 q = (char *)initrd_start;
345 346
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
363 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 364 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
364 " [mem %#010llx-%#010llx]\n", 365 " [mem %#010llx-%#010llx]\n",
365 ramdisk_image, ramdisk_image + ramdisk_size - 1, 366 ramdisk_image, ramdisk_image + ramdisk_size - 1,
366 ramdisk_here, ramdisk_here + ramdisk_size - 1); 367 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
367} 368}
368 369
369static void __init early_reserve_initrd(void) 370static void __init early_reserve_initrd(void)
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void)
447 case SETUP_DTB: 448 case SETUP_DTB:
448 add_dtb(pa_data); 449 add_dtb(pa_data);
449 break; 450 break;
451 case SETUP_EFI:
452 parse_efi_setup(pa_data, data_len);
453 break;
450 default: 454 default:
451 break; 455 break;
452 } 456 }
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)
824} 828}
825 829
826/* 830/*
831 * Dump out kernel offset information on panic.
832 */
833static int
834dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
835{
836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
837 "(relocation range: 0x%lx-0x%lx)\n",
838 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
839 __START_KERNEL_map, MODULES_VADDR-1);
840
841 return 0;
842}
843
844/*
827 * Determine if we were loaded by an EFI loader. If so, then we have also been 845 * Determine if we were loaded by an EFI loader. If so, then we have also been
828 * passed the efi memmap, systab, etc., so we should use these data structures 846 * passed the efi memmap, systab, etc., so we should use these data structures
829 * for initialization. Note, the efi init code path is determined by the 847 * for initialization. Note, the efi init code path is determined by the
@@ -924,8 +942,6 @@ void __init setup_arch(char **cmdline_p)
924 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 942 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
925 setup_memory_map(); 943 setup_memory_map();
926 parse_setup_data(); 944 parse_setup_data();
927 /* update the e820_saved too */
928 e820_reserve_setup_data();
929 945
930 copy_edd(); 946 copy_edd();
931 947
@@ -987,6 +1003,8 @@ void __init setup_arch(char **cmdline_p)
987 early_dump_pci_devices(); 1003 early_dump_pci_devices();
988#endif 1004#endif
989 1005
1006 /* update the e820_saved too */
1007 e820_reserve_setup_data();
990 finish_e820_parsing(); 1008 finish_e820_parsing();
991 1009
992 if (efi_enabled(EFI_BOOT)) 1010 if (efi_enabled(EFI_BOOT))
@@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
1101 1119
1102 setup_real_mode(); 1120 setup_real_mode();
1103 1121
1104 memblock_set_current_limit(get_max_mapped()); 1122 memblock_set_current_limit(get_max_low_mapped());
1105 dma_contiguous_reserve(0); 1123 dma_contiguous_reserve(0);
1106 1124
1107 /* 1125 /*
@@ -1248,3 +1266,15 @@ void __init i386_reserve_resources(void)
1248} 1266}
1249 1267
1250#endif /* CONFIG_X86_32 */ 1268#endif /* CONFIG_X86_32 */
1269
1270static struct notifier_block kernel_offset_notifier = {
1271 .notifier_call = dump_kernel_offset
1272};
1273
1274static int __init register_kernel_offset_dumper(void)
1275{
1276 atomic_notifier_chain_register(&panic_notifier_list,
1277 &kernel_offset_notifier);
1278 return 0;
1279}
1280__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..a32da804252e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1312,6 +1312,12 @@ void cpu_disable_common(void)
1312 1312
1313int native_cpu_disable(void) 1313int native_cpu_disable(void)
1314{ 1314{
1315 int ret;
1316
1317 ret = check_irq_vectors_for_cpu_disable();
1318 if (ret)
1319 return ret;
1320
1315 clear_local_APIC(); 1321 clear_local_APIC();
1316 1322
1317 cpu_disable_common(); 1323 cpu_disable_common();
@@ -1417,7 +1423,9 @@ static inline void mwait_play_dead(void)
1417 * The WBINVD is insufficient due to the spurious-wakeup 1423 * The WBINVD is insufficient due to the spurious-wakeup
1418 * case where we return around the loop. 1424 * case where we return around the loop.
1419 */ 1425 */
1426 mb();
1420 clflush(mwait_ptr); 1427 clflush(mwait_ptr);
1428 mb();
1421 __monitor(mwait_ptr, 0, 0); 1429 __monitor(mwait_ptr, 0, 0);
1422 mb(); 1430 mb();
1423 __mwait(eax, 0); 1431 __mwait(eax, 0);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed890b4c..57409f6b8c62 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
211 exception_exit(prev_state); \ 211 exception_exit(prev_state); \
212} 212}
213 213
214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
215 regs->ip) 215DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
216DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 216DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
217DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 217DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
218DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 218DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
219 regs->ip) 219DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
220DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", 220DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
221 coprocessor_segment_overrun)
222DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
223DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
224#ifdef CONFIG_X86_32 221#ifdef CONFIG_X86_32
225DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 222DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
226#endif 223#endif
227DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 224DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
228 BUS_ADRALN, 0)
229 225
230#ifdef CONFIG_X86_64 226#ifdef CONFIG_X86_64
231/* Runs on IST stack */ 227/* Runs on IST stack */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..a3acbac2ee72 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
11#include <linux/clocksource.h> 11#include <linux/clocksource.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/timex.h> 13#include <linux/timex.h>
14#include <linux/static_key.h>
14 15
15#include <asm/hpet.h> 16#include <asm/hpet.h>
16#include <asm/timer.h> 17#include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
37 erroneous rdtsc usage on !cpu_has_tsc processors */ 38 erroneous rdtsc usage on !cpu_has_tsc processors */
38static int __read_mostly tsc_disabled = -1; 39static int __read_mostly tsc_disabled = -1;
39 40
41static struct static_key __use_tsc = STATIC_KEY_INIT;
42
40int tsc_clocksource_reliable; 43int tsc_clocksource_reliable;
44
45/*
46 * Use a ring-buffer like data structure, where a writer advances the head by
47 * writing a new data entry and a reader advances the tail when it observes a
48 * new entry.
49 *
50 * Writers are made to wait on readers until there's space to write a new
51 * entry.
52 *
53 * This means that we can always use an {offset, mul} pair to compute a ns
54 * value that is 'roughly' in the right direction, even if we're writing a new
55 * {offset, mul} pair during the clock read.
56 *
 57 * The down-side is that we can no longer guarantee strict monotonicity
 58 * (assuming the TSC was monotonic to begin with), because while we compute the
 59 * intersection point of the two clock slopes and make sure the time is
 60 * continuous at the point of switching, we can no longer guarantee a reader is
 61 * strictly before or after the switch point.
62 *
63 * It does mean a reader no longer needs to disable IRQs in order to avoid
 64 * CPU-Freq updates messing with its times, and similarly an NMI reader will
65 * no longer run the risk of hitting half-written state.
66 */
67
68struct cyc2ns {
69 struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
70 struct cyc2ns_data *head; /* 48 + 8 = 56 */
71 struct cyc2ns_data *tail; /* 56 + 8 = 64 */
72}; /* exactly fits one cacheline */
73
74static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
75
76struct cyc2ns_data *cyc2ns_read_begin(void)
77{
78 struct cyc2ns_data *head;
79
80 preempt_disable();
81
82 head = this_cpu_read(cyc2ns.head);
83 /*
84 * Ensure we observe the entry when we observe the pointer to it.
85 * matches the wmb from cyc2ns_write_end().
86 */
87 smp_read_barrier_depends();
88 head->__count++;
89 barrier();
90
91 return head;
92}
93
94void cyc2ns_read_end(struct cyc2ns_data *head)
95{
96 barrier();
97 /*
 98	 * If we're the outermost nested read, update the tail pointer
99 * when we're done. This notifies possible pending writers
100 * that we've observed the head pointer and that the other
101 * entry is now free.
102 */
103 if (!--head->__count) {
104 /*
105 * x86-TSO does not reorder writes with older reads;
106 * therefore once this write becomes visible to another
107 * cpu, we must be finished reading the cyc2ns_data.
108 *
109 * matches with cyc2ns_write_begin().
110 */
111 this_cpu_write(cyc2ns.tail, head);
112 }
113 preempt_enable();
114}
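
For illustration, the read-side pair above is meant to be used as in the sketch below (cycles_2_ns() further down open-codes the same sequence to save a few instructions; the helper name is made up):

static u64 example_cyc2ns(u64 cyc)	/* illustration-only */
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	u64 ns;

	ns = data->cyc2ns_offset +
	     mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

	cyc2ns_read_end(data);

	return ns;
}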
115
116/*
117 * Begin writing a new @data entry for @cpu.
118 *
119 * Assumes some sort of write side lock; currently 'provided' by the assumption
120 * that cpufreq will call its notifiers sequentially.
121 */
122static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
123{
124 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
125 struct cyc2ns_data *data = c2n->data;
126
127 if (data == c2n->head)
128 data++;
129
130 /* XXX send an IPI to @cpu in order to guarantee a read? */
131
132 /*
133 * When we observe the tail write from cyc2ns_read_end(),
 134	 * the cpu must be done with that entry and it's safe
135 * to start writing to it.
136 */
137 while (c2n->tail == data)
138 cpu_relax();
139
140 return data;
141}
142
143static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
144{
145 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
146
147 /*
148 * Ensure the @data writes are visible before we publish the
 149	 * entry. Matches the data-dependency in cyc2ns_read_begin().
150 */
151 smp_wmb();
152
153 ACCESS_ONCE(c2n->head) = data;
154}
155
156/*
157 * Accelerators for sched_clock()
158 * convert from cycles(64bits) => nanoseconds (64bits)
159 * basic equation:
160 * ns = cycles / (freq / ns_per_sec)
161 * ns = cycles * (ns_per_sec / freq)
162 * ns = cycles * (10^9 / (cpu_khz * 10^3))
163 * ns = cycles * (10^6 / cpu_khz)
164 *
165 * Then we use scaling math (suggested by george@mvista.com) to get:
166 * ns = cycles * (10^6 * SC / cpu_khz) / SC
167 * ns = cycles * cyc2ns_scale / SC
168 *
169 * And since SC is a constant power of two, we can convert the div
170 * into a shift.
171 *
172 * We can use khz divisor instead of mhz to keep a better precision, since
173 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
174 * (mathieu.desnoyers@polymtl.ca)
175 *
176 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
177 */
178
179#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
180
181static void cyc2ns_data_init(struct cyc2ns_data *data)
182{
183 data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
184 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
185 data->cyc2ns_offset = 0;
186 data->__count = 0;
187}
188
189static void cyc2ns_init(int cpu)
190{
191 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
192
193 cyc2ns_data_init(&c2n->data[0]);
194 cyc2ns_data_init(&c2n->data[1]);
195
196 c2n->head = c2n->data;
197 c2n->tail = c2n->data;
198}
199
200static inline unsigned long long cycles_2_ns(unsigned long long cyc)
201{
202 struct cyc2ns_data *data, *tail;
203 unsigned long long ns;
204
205 /*
206 * See cyc2ns_read_*() for details; replicated in order to avoid
207 * an extra few instructions that came with the abstraction.
 208	 * Notably, it allows us to only do the __count and tail update
 209	 * dance when it's actually needed.
210 */
211
212 preempt_disable();
213 data = this_cpu_read(cyc2ns.head);
214 tail = this_cpu_read(cyc2ns.tail);
215
216 if (likely(data == tail)) {
217 ns = data->cyc2ns_offset;
218 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
219 } else {
220 data->__count++;
221
222 barrier();
223
224 ns = data->cyc2ns_offset;
225 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
226
227 barrier();
228
229 if (!--data->__count)
230 this_cpu_write(cyc2ns.tail, data);
231 }
232 preempt_enable();
233
234 return ns;
235}
236
237/* XXX surely we already have this someplace in the kernel?! */
238#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
239
240static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
241{
242 unsigned long long tsc_now, ns_now;
243 struct cyc2ns_data *data;
244 unsigned long flags;
245
246 local_irq_save(flags);
247 sched_clock_idle_sleep_event();
248
249 if (!cpu_khz)
250 goto done;
251
252 data = cyc2ns_write_begin(cpu);
253
254 rdtscll(tsc_now);
255 ns_now = cycles_2_ns(tsc_now);
256
257 /*
258 * Compute a new multiplier as per the above comment and ensure our
259 * time function is continuous; see the comment near struct
260 * cyc2ns_data.
261 */
262 data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
263 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
264 data->cyc2ns_offset = ns_now -
265 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
266
267 cyc2ns_write_end(cpu, data);
268
269done:
270 sched_clock_idle_wakeup_event(0);
271 local_irq_restore(flags);
272}
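
A worked instance of the scaling math above, with an illustrative cpu_khz of 2000000 (a 2 GHz TSC) and the continuity offset ignored:

	cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz)
	           = DIV_ROUND(1000000 * 1024, 2000000) = 512
	ns         = mul_u64_u32_shr(cyc, 512, 10) = (cyc * 512) >> 10 = cyc / 2

i.e. 0.5 ns per cycle, exactly the period of a 2 GHz clock.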
41/* 273/*
42 * Scheduler clock - returns current time in nanosec units. 274 * Scheduler clock - returns current time in nanosec units.
43 */ 275 */
44u64 native_sched_clock(void) 276u64 native_sched_clock(void)
45{ 277{
46 u64 this_offset; 278 u64 tsc_now;
47 279
48 /* 280 /*
49 * Fall back to jiffies if there's no TSC available: 281 * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
53 * very important for it to be as fast as the platform 285 * very important for it to be as fast as the platform
54 * can achieve it. ) 286 * can achieve it. )
55 */ 287 */
56 if (unlikely(tsc_disabled)) { 288 if (!static_key_false(&__use_tsc)) {
57 /* No locking but a rare wrong value is not a big deal: */ 289 /* No locking but a rare wrong value is not a big deal: */
58 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 290 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
59 } 291 }
60 292
61 /* read the Time Stamp Counter: */ 293 /* read the Time Stamp Counter: */
62 rdtscll(this_offset); 294 rdtscll(tsc_now);
63 295
64 /* return the value in ns */ 296 /* return the value in ns */
65 return __cycles_2_ns(this_offset); 297 return cycles_2_ns(tsc_now);
66} 298}
67 299
68/* We need to define a real function for sched_clock, to override the 300/* We need to define a real function for sched_clock, to override the
@@ -419,6 +651,16 @@ unsigned long native_calibrate_tsc(void)
419 unsigned long flags, latch, ms, fast_calibrate; 651 unsigned long flags, latch, ms, fast_calibrate;
420 int hpet = is_hpet_enabled(), i, loopmin; 652 int hpet = is_hpet_enabled(), i, loopmin;
421 653
654 /* Calibrate TSC using MSR for Intel Atom SoCs */
655 local_irq_save(flags);
656 i = try_msr_calibrate_tsc(&fast_calibrate);
657 local_irq_restore(flags);
658 if (i >= 0) {
659 if (i == 0)
660 pr_warn("Fast TSC calibration using MSR failed\n");
661 return fast_calibrate;
662 }
663
422 local_irq_save(flags); 664 local_irq_save(flags);
423 fast_calibrate = quick_pit_calibrate(); 665 fast_calibrate = quick_pit_calibrate();
424 local_irq_restore(flags); 666 local_irq_restore(flags);
@@ -589,61 +831,11 @@ int recalibrate_cpu_khz(void)
589EXPORT_SYMBOL(recalibrate_cpu_khz); 831EXPORT_SYMBOL(recalibrate_cpu_khz);
590 832
591 833
592/* Accelerators for sched_clock()
593 * convert from cycles(64bits) => nanoseconds (64bits)
594 * basic equation:
595 * ns = cycles / (freq / ns_per_sec)
596 * ns = cycles * (ns_per_sec / freq)
597 * ns = cycles * (10^9 / (cpu_khz * 10^3))
598 * ns = cycles * (10^6 / cpu_khz)
599 *
600 * Then we use scaling math (suggested by george@mvista.com) to get:
601 * ns = cycles * (10^6 * SC / cpu_khz) / SC
602 * ns = cycles * cyc2ns_scale / SC
603 *
604 * And since SC is a constant power of two, we can convert the div
605 * into a shift.
606 *
607 * We can use khz divisor instead of mhz to keep a better precision, since
608 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
609 * (mathieu.desnoyers@polymtl.ca)
610 *
611 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
612 */
613
614DEFINE_PER_CPU(unsigned long, cyc2ns);
615DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
616
617static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
618{
619 unsigned long long tsc_now, ns_now, *offset;
620 unsigned long flags, *scale;
621
622 local_irq_save(flags);
623 sched_clock_idle_sleep_event();
624
625 scale = &per_cpu(cyc2ns, cpu);
626 offset = &per_cpu(cyc2ns_offset, cpu);
627
628 rdtscll(tsc_now);
629 ns_now = __cycles_2_ns(tsc_now);
630
631 if (cpu_khz) {
632 *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
633 cpu_khz / 2) / cpu_khz;
634 *offset = ns_now - mult_frac(tsc_now, *scale,
635 (1UL << CYC2NS_SCALE_FACTOR));
636 }
637
638 sched_clock_idle_wakeup_event(0);
639 local_irq_restore(flags);
640}
641
642static unsigned long long cyc2ns_suspend; 834static unsigned long long cyc2ns_suspend;
643 835
644void tsc_save_sched_clock_state(void) 836void tsc_save_sched_clock_state(void)
645{ 837{
646 if (!sched_clock_stable) 838 if (!sched_clock_stable())
647 return; 839 return;
648 840
649 cyc2ns_suspend = sched_clock(); 841 cyc2ns_suspend = sched_clock();
@@ -663,16 +855,26 @@ void tsc_restore_sched_clock_state(void)
663 unsigned long flags; 855 unsigned long flags;
664 int cpu; 856 int cpu;
665 857
666 if (!sched_clock_stable) 858 if (!sched_clock_stable())
667 return; 859 return;
668 860
669 local_irq_save(flags); 861 local_irq_save(flags);
670 862
671 __this_cpu_write(cyc2ns_offset, 0); 863 /*
 864	 * We're coming out of suspend, there's no concurrency yet; don't
865 * bother being nice about the RCU stuff, just write to both
866 * data fields.
867 */
868
869 this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
870 this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
871
672 offset = cyc2ns_suspend - sched_clock(); 872 offset = cyc2ns_suspend - sched_clock();
673 873
674 for_each_possible_cpu(cpu) 874 for_each_possible_cpu(cpu) {
675 per_cpu(cyc2ns_offset, cpu) = offset; 875 per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
876 per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
877 }
676 878
677 local_irq_restore(flags); 879 local_irq_restore(flags);
678} 880}
@@ -795,7 +997,7 @@ void mark_tsc_unstable(char *reason)
795{ 997{
796 if (!tsc_unstable) { 998 if (!tsc_unstable) {
797 tsc_unstable = 1; 999 tsc_unstable = 1;
798 sched_clock_stable = 0; 1000 clear_sched_clock_stable();
799 disable_sched_clock_irqtime(); 1001 disable_sched_clock_irqtime();
800 pr_info("Marking TSC unstable due to %s\n", reason); 1002 pr_info("Marking TSC unstable due to %s\n", reason);
801 /* Change only the rating, when not registered */ 1003 /* Change only the rating, when not registered */
@@ -995,14 +1197,18 @@ void __init tsc_init(void)
995 * speed as the bootup CPU. (cpufreq notifiers will fix this 1197 * speed as the bootup CPU. (cpufreq notifiers will fix this
996 * up if their speed diverges) 1198 * up if their speed diverges)
997 */ 1199 */
998 for_each_possible_cpu(cpu) 1200 for_each_possible_cpu(cpu) {
1201 cyc2ns_init(cpu);
999 set_cyc2ns_scale(cpu_khz, cpu); 1202 set_cyc2ns_scale(cpu_khz, cpu);
1203 }
1000 1204
1001 if (tsc_disabled > 0) 1205 if (tsc_disabled > 0)
1002 return; 1206 return;
1003 1207
1004 /* now allow native_sched_clock() to use rdtsc */ 1208 /* now allow native_sched_clock() to use rdtsc */
1209
1005 tsc_disabled = 0; 1210 tsc_disabled = 0;
1211 static_key_slow_inc(&__use_tsc);
1006 1212
1007 if (!no_sched_irq_time) 1213 if (!no_sched_irq_time)
1008 enable_sched_clock_irqtime(); 1214 enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..8b5434f4389f
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
1/*
2 * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
3 *
4 * TSC in Intel Atom SoC runs at a constant rate which can be figured
5 * by this formula:
6 * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
 7 * See the Intel 64 and IA-32 System Programming Guide, sections 16.12 and 30.11.5,
 8 * for details.
 9 * In particular, some Intel Atom SoCs don't have a PIT (i8254) or HPET, so MSR-
 10 * based calibration is the only option.
11 *
12 *
13 * Copyright (C) 2013 Intel Corporation
14 * Author: Bin Gao <bin.gao@intel.com>
15 *
16 * This file is released under the GPLv2.
17 */
18
19#include <linux/kernel.h>
20#include <asm/processor.h>
21#include <asm/setup.h>
22#include <asm/apic.h>
23#include <asm/param.h>
24
25/* CPU reference clock frequency: in KHz */
26#define FREQ_83 83200
27#define FREQ_100 99840
28#define FREQ_133 133200
29#define FREQ_166 166400
30
31#define MAX_NUM_FREQS 8
32
33/*
34 * According to Intel 64 and IA-32 System Programming Guide,
35 * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
36 * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
 37 * Unfortunately, some Intel Atom SoCs aren't quite compliant with this,
 38 * so we need to manually differentiate SoC families. This is what the
39 * field msr_plat does.
40 */
41struct freq_desc {
42 u8 x86_family; /* CPU family */
43 u8 x86_model; /* model */
44 u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
45 u32 freqs[MAX_NUM_FREQS];
46};
47
48static struct freq_desc freq_desc_tables[] = {
49 /* PNW */
50 { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
51 /* CLV+ */
52 { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
53 /* TNG */
54 { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
55 /* VLV2 */
56 { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
57 /* ANN */
58 { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
59};
60
61static int match_cpu(u8 family, u8 model)
62{
63 int i;
64
65 for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
66 if ((family == freq_desc_tables[i].x86_family) &&
67 (model == freq_desc_tables[i].x86_model))
68 return i;
69 }
70
71 return -1;
72}
73
74/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
75#define id_to_freq(cpu_index, freq_id) \
76 (freq_desc_tables[cpu_index].freqs[freq_id])
77
78/*
79 * Do MSR calibration only for known/supported CPUs.
80 * Return values:
81 * -1: CPU is unknown/unsupported for MSR based calibration
82 * 0: CPU is known/supported, but calibration failed
83 * 1: CPU is known/supported, and calibration succeeded
84 */
85int try_msr_calibrate_tsc(unsigned long *fast_calibrate)
86{
87 int cpu_index;
88 u32 lo, hi, ratio, freq_id, freq;
89
90 cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
91 if (cpu_index < 0)
92 return -1;
93
94 *fast_calibrate = 0;
95
96 if (freq_desc_tables[cpu_index].msr_plat) {
97 rdmsr(MSR_PLATFORM_INFO, lo, hi);
98 ratio = (lo >> 8) & 0x1f;
99 } else {
100 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
101 ratio = (hi >> 8) & 0x1f;
102 }
103 pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
104
105 if (!ratio)
106 return 0;
107
108 /* Get FSB FREQ ID */
109 rdmsr(MSR_FSB_FREQ, lo, hi);
110 freq_id = lo & 0x7;
111 freq = id_to_freq(cpu_index, freq_id);
112 pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
113 freq_id, freq);
114 if (!freq)
115 return 0;
116
117 /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
118 *fast_calibrate = freq * ratio;
119 pr_info("TSC runs at %lu KHz\n", *fast_calibrate);
120
121#ifdef CONFIG_X86_LOCAL_APIC
122 lapic_timer_frequency = (freq * 1000) / HZ;
123 pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
124#endif
125
126 return 1;
127}
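
A worked example may help make the formula above concrete. The following is a stand-alone user-space sketch (not kernel code; the MSR contents are hypothetical, and the frequency table mirrors the VLV2 row above) of how the bus ratio and FSB frequency ID combine into a TSC frequency:

    #include <stdio.h>
    #include <stdint.h>

    /* Frequency bucket table in kHz, mirroring the VLV2 row above:
     * { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 }. */
    static const uint32_t freqs[8] = { 0, 99840, 133200, 166400, 0, 0, 0, 0 };

    int main(void)
    {
        /* Hypothetical MSR contents: bits [12:8] of MSR_PLATFORM_INFO report
         * a maximum bus ratio of 16, bits [2:0] of MSR_FSB_FREQ report
         * frequency ID 2 (the 133 MHz bucket). */
        uint32_t platform_info_lo = 16u << 8;
        uint32_t fsb_freq_lo = 2;

        uint32_t ratio = (platform_info_lo >> 8) & 0x1f;
        uint32_t freq_id = fsb_freq_lo & 0x7;
        uint32_t freq = freqs[freq_id];              /* 133200 kHz */

        /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
        unsigned long tsc_khz = (unsigned long)freq * ratio;

        printf("TSC runs at %lu kHz (~%.2f GHz)\n", tsc_khz, tsc_khz / 1e6);
        return 0;
    }

With these assumed values the product is 2131200 kHz, roughly a 2.13 GHz TSC, which is the same quantity try_msr_calibrate_tsc() stores in *fast_calibrate and reports via pr_info().
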
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56a3714..26488487bc61 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
16 */ 16 */
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h> 19#include <linux/smp.h>
21#include <linux/nmi.h> 20#include <linux/nmi.h>
22#include <asm/tsc.h> 21#include <asm/tsc.h>
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 021783b1f46a..e48b674639cc 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -136,9 +136,9 @@ void arch_teardown_msi_irq(unsigned int irq)
136 x86_msi.teardown_msi_irq(irq); 136 x86_msi.teardown_msi_irq(irq);
137} 137}
138 138
139void arch_restore_msi_irqs(struct pci_dev *dev, int irq) 139void arch_restore_msi_irqs(struct pci_dev *dev)
140{ 140{
141 x86_msi.restore_msi_irqs(dev, irq); 141 x86_msi.restore_msi_irqs(dev);
142} 142}
143u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) 143u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
144{ 144{
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd8223470..a4b451c6addf 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
562 if (cpu_has_xsaveopt && eagerfpu != DISABLE) 562 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
563 eagerfpu = ENABLE; 563 eagerfpu = ENABLE;
564 564
565 if (pcntxt_mask & XSTATE_EAGER) {
566 if (eagerfpu == DISABLE) {
567 pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
568 pcntxt_mask & XSTATE_EAGER);
569 pcntxt_mask &= ~XSTATE_EAGER;
570 } else {
571 eagerfpu = ENABLE;
572 }
573 }
574
565 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 575 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
566 pcntxt_mask, xstate_size); 576 pcntxt_mask, xstate_size);
567} 577}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b89c5db2b832..287e4c85fff9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,7 +80,7 @@ config KVM_MMU_AUDIT
80 depends on KVM && TRACEPOINTS 80 depends on KVM && TRACEPOINTS
81 ---help--- 81 ---help---
82 This option adds a R/W kVM module parameter 'mmu_audit', which allows 82 This option adds a R/W kVM module parameter 'mmu_audit', which allows
83 audit KVM MMU at runtime. 83 auditing of KVM MMU events at runtime.
84 84
85config KVM_DEVICE_ASSIGNMENT 85config KVM_DEVICE_ASSIGNMENT
86 bool "KVM legacy PCI device assignment support" 86 bool "KVM legacy PCI device assignment support"
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 412a5aa0ef94..518d86471b76 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -37,6 +37,7 @@
37 37
38#include "irq.h" 38#include "irq.h"
39#include "i8254.h" 39#include "i8254.h"
40#include "x86.h"
40 41
41#ifndef CONFIG_X86_64 42#ifndef CONFIG_X86_64
42#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) 43#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -349,6 +350,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
349 atomic_set(&ps->pending, 0); 350 atomic_set(&ps->pending, 0);
350 ps->irq_ack = 1; 351 ps->irq_ack = 1;
351 352
353 /*
354 * Do not allow the guest to program periodic timers with a small
355 * interval, since the hrtimers are not throttled by the host
356 * scheduler.
357 */
358 if (ps->is_periodic) {
359 s64 min_period = min_timer_period_us * 1000LL;
360
361 if (ps->period < min_period) {
362 pr_info_ratelimited(
363 "kvm: requested %lld ns "
364 "i8254 timer period limited to %lld ns\n",
365 ps->period, min_period);
366 ps->period = min_period;
367 }
368 }
369
352 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), 370 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
353 HRTIMER_MODE_ABS); 371 HRTIMER_MODE_ABS);
354} 372}
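
The clamp added above caps guest-programmed periodic PIT intervals at min_timer_period_us (which defaults to 500, see the arch/x86/kvm/x86.c change later in this diff). A stand-alone sketch of the same arithmetic, with a hypothetical guest-requested period:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int min_timer_period_us = 500;  /* default module parameter */
        int64_t period_ns = 100000;              /* hypothetical: guest asks for 100 us */
        int64_t min_period_ns = (int64_t)min_timer_period_us * 1000LL;

        if (period_ns < min_period_ns) {
            printf("requested %lld ns i8254 timer period limited to %lld ns\n",
                   (long long)period_ns, (long long)min_period_ns);
            period_ns = min_period_ns;
        }
        printf("effective period: %lld ns\n", (long long)period_ns);
        return 0;
    }
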
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1673940cf9c3..9736529ade08 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -71,9 +71,6 @@
71#define VEC_POS(v) ((v) & (32 - 1)) 71#define VEC_POS(v) ((v) & (32 - 1))
72#define REG_POS(v) (((v) >> 5) << 4) 72#define REG_POS(v) (((v) >> 5) << 4)
73 73
74static unsigned int min_timer_period_us = 500;
75module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
76
77static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) 74static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
78{ 75{
79 *((u32 *) (apic->regs + reg_off)) = val; 76 *((u32 *) (apic->regs + reg_off)) = val;
@@ -435,7 +432,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
435 u8 val; 432 u8 val;
436 if (pv_eoi_get_user(vcpu, &val) < 0) 433 if (pv_eoi_get_user(vcpu, &val) < 0)
437 apic_debug("Can't read EOI MSR value: 0x%llx\n", 434 apic_debug("Can't read EOI MSR value: 0x%llx\n",
438 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 435 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
439 return val & 0x1; 436 return val & 0x1;
440} 437}
441 438
@@ -443,7 +440,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
443{ 440{
444 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { 441 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
445 apic_debug("Can't set EOI MSR value: 0x%llx\n", 442 apic_debug("Can't set EOI MSR value: 0x%llx\n",
446 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 443 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
447 return; 444 return;
448 } 445 }
449 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 446 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
@@ -453,7 +450,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
453{ 450{
454 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { 451 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
455 apic_debug("Can't clear EOI MSR value: 0x%llx\n", 452 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
456 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 453 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
457 return; 454 return;
458 } 455 }
459 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 456 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
@@ -1355,7 +1352,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1355 vcpu->arch.apic_base = value; 1352 vcpu->arch.apic_base = value;
1356 1353
1357 /* update jump label if enable bit changes */ 1354 /* update jump label if enable bit changes */
1358 if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { 1355 if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
1359 if (value & MSR_IA32_APICBASE_ENABLE) 1356 if (value & MSR_IA32_APICBASE_ENABLE)
1360 static_key_slow_dec_deferred(&apic_hw_disabled); 1357 static_key_slow_dec_deferred(&apic_hw_disabled);
1361 else 1358 else
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40772ef0f2b1..e50425d0f5f7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2659,6 +2659,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2659 int emulate = 0; 2659 int emulate = 0;
2660 gfn_t pseudo_gfn; 2660 gfn_t pseudo_gfn;
2661 2661
2662 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2663 return 0;
2664
2662 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2665 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2663 if (iterator.level == level) { 2666 if (iterator.level == level) {
2664 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, 2667 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
@@ -2829,6 +2832,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2829 bool ret = false; 2832 bool ret = false;
2830 u64 spte = 0ull; 2833 u64 spte = 0ull;
2831 2834
2835 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2836 return false;
2837
2832 if (!page_fault_can_be_fast(error_code)) 2838 if (!page_fault_can_be_fast(error_code))
2833 return false; 2839 return false;
2834 2840
@@ -3224,6 +3230,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
3224 struct kvm_shadow_walk_iterator iterator; 3230 struct kvm_shadow_walk_iterator iterator;
3225 u64 spte = 0ull; 3231 u64 spte = 0ull;
3226 3232
3233 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3234 return spte;
3235
3227 walk_shadow_page_lockless_begin(vcpu); 3236 walk_shadow_page_lockless_begin(vcpu);
3228 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) 3237 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3229 if (!is_shadow_present_pte(spte)) 3238 if (!is_shadow_present_pte(spte))
@@ -4510,6 +4519,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
4510 u64 spte; 4519 u64 spte;
4511 int nr_sptes = 0; 4520 int nr_sptes = 0;
4512 4521
4522 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
4523 return nr_sptes;
4524
4513 walk_shadow_page_lockless_begin(vcpu); 4525 walk_shadow_page_lockless_begin(vcpu);
4514 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { 4526 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4515 sptes[iterator.level-1] = spte; 4527 sptes[iterator.level-1] = spte;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ad75d77999d0..cba218a2f08d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -569,6 +569,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
569 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 569 if (FNAME(gpte_changed)(vcpu, gw, top_level))
570 goto out_gpte_changed; 570 goto out_gpte_changed;
571 571
572 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
573 goto out_gpte_changed;
574
572 for (shadow_walk_init(&it, vcpu, addr); 575 for (shadow_walk_init(&it, vcpu, addr);
573 shadow_walk_okay(&it) && it.level > gw->level; 576 shadow_walk_okay(&it) && it.level > gw->level;
574 shadow_walk_next(&it)) { 577 shadow_walk_next(&it)) {
@@ -820,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
820 */ 823 */
821 mmu_topup_memory_caches(vcpu); 824 mmu_topup_memory_caches(vcpu);
822 825
826 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
827 WARN_ON(1);
828 return;
829 }
830
823 spin_lock(&vcpu->kvm->mmu_lock); 831 spin_lock(&vcpu->kvm->mmu_lock);
824 for_each_shadow_entry(vcpu, gva, iterator) { 832 for_each_shadow_entry(vcpu, gva, iterator) {
825 level = iterator.level; 833 level = iterator.level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c7168a5cff1b..e81df8fce027 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1671,6 +1671,19 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1671 mark_dirty(svm->vmcb, VMCB_ASID); 1671 mark_dirty(svm->vmcb, VMCB_ASID);
1672} 1672}
1673 1673
1674static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1675{
1676 return to_svm(vcpu)->vmcb->save.dr6;
1677}
1678
1679static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1680{
1681 struct vcpu_svm *svm = to_svm(vcpu);
1682
1683 svm->vmcb->save.dr6 = value;
1684 mark_dirty(svm->vmcb, VMCB_DR);
1685}
1686
1674static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1687static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1675{ 1688{
1676 struct vcpu_svm *svm = to_svm(vcpu); 1689 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4286,6 +4299,8 @@ static struct kvm_x86_ops svm_x86_ops = {
4286 .set_idt = svm_set_idt, 4299 .set_idt = svm_set_idt,
4287 .get_gdt = svm_get_gdt, 4300 .get_gdt = svm_get_gdt,
4288 .set_gdt = svm_set_gdt, 4301 .set_gdt = svm_set_gdt,
4302 .get_dr6 = svm_get_dr6,
4303 .set_dr6 = svm_set_dr6,
4289 .set_dr7 = svm_set_dr7, 4304 .set_dr7 = svm_set_dr7,
4290 .cache_reg = svm_cache_reg, 4305 .cache_reg = svm_cache_reg,
4291 .get_rflags = svm_get_rflags, 4306 .get_rflags = svm_get_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da7837e1349d..5c8879127cfa 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -418,6 +418,8 @@ struct vcpu_vmx {
418 u64 msr_host_kernel_gs_base; 418 u64 msr_host_kernel_gs_base;
419 u64 msr_guest_kernel_gs_base; 419 u64 msr_guest_kernel_gs_base;
420#endif 420#endif
421 u32 vm_entry_controls_shadow;
422 u32 vm_exit_controls_shadow;
421 /* 423 /*
422 * loaded_vmcs points to the VMCS currently used in this vcpu. For a 424 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
423 * non-nested (L1) guest, it always points to vmcs01. For a nested 425 * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -1056,7 +1058,9 @@ static inline bool is_exception(u32 intr_info)
1056 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); 1058 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1057} 1059}
1058 1060
1059static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); 1061static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1062 u32 exit_intr_info,
1063 unsigned long exit_qualification);
1060static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 1064static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1061 struct vmcs12 *vmcs12, 1065 struct vmcs12 *vmcs12,
1062 u32 reason, unsigned long qualification); 1066 u32 reason, unsigned long qualification);
@@ -1326,6 +1330,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
1326 vmcs_writel(field, vmcs_readl(field) | mask); 1330 vmcs_writel(field, vmcs_readl(field) | mask);
1327} 1331}
1328 1332
1333static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1334{
1335 vmcs_write32(VM_ENTRY_CONTROLS, val);
1336 vmx->vm_entry_controls_shadow = val;
1337}
1338
1339static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1340{
1341 if (vmx->vm_entry_controls_shadow != val)
1342 vm_entry_controls_init(vmx, val);
1343}
1344
1345static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1346{
1347 return vmx->vm_entry_controls_shadow;
1348}
1349
1350
1351static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1352{
1353 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1354}
1355
1356static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1357{
1358 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1359}
1360
1361static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1362{
1363 vmcs_write32(VM_EXIT_CONTROLS, val);
1364 vmx->vm_exit_controls_shadow = val;
1365}
1366
1367static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1368{
1369 if (vmx->vm_exit_controls_shadow != val)
1370 vm_exit_controls_init(vmx, val);
1371}
1372
1373static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1374{
1375 return vmx->vm_exit_controls_shadow;
1376}
1377
1378
1379static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1380{
1381 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1382}
1383
1384static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1385{
1386 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1387}
1388
1329static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 1389static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1330{ 1390{
1331 vmx->segment_cache.bitmask = 0; 1391 vmx->segment_cache.bitmask = 0;
@@ -1410,11 +1470,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1410 vmcs_write32(EXCEPTION_BITMAP, eb); 1470 vmcs_write32(EXCEPTION_BITMAP, eb);
1411} 1471}
1412 1472
1413static void clear_atomic_switch_msr_special(unsigned long entry, 1473static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1414 unsigned long exit) 1474 unsigned long entry, unsigned long exit)
1415{ 1475{
1416 vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); 1476 vm_entry_controls_clearbit(vmx, entry);
1417 vmcs_clear_bits(VM_EXIT_CONTROLS, exit); 1477 vm_exit_controls_clearbit(vmx, exit);
1418} 1478}
1419 1479
1420static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1480static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
@@ -1425,14 +1485,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1425 switch (msr) { 1485 switch (msr) {
1426 case MSR_EFER: 1486 case MSR_EFER:
1427 if (cpu_has_load_ia32_efer) { 1487 if (cpu_has_load_ia32_efer) {
1428 clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, 1488 clear_atomic_switch_msr_special(vmx,
1489 VM_ENTRY_LOAD_IA32_EFER,
1429 VM_EXIT_LOAD_IA32_EFER); 1490 VM_EXIT_LOAD_IA32_EFER);
1430 return; 1491 return;
1431 } 1492 }
1432 break; 1493 break;
1433 case MSR_CORE_PERF_GLOBAL_CTRL: 1494 case MSR_CORE_PERF_GLOBAL_CTRL:
1434 if (cpu_has_load_perf_global_ctrl) { 1495 if (cpu_has_load_perf_global_ctrl) {
1435 clear_atomic_switch_msr_special( 1496 clear_atomic_switch_msr_special(vmx,
1436 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1497 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1437 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1498 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1438 return; 1499 return;
@@ -1453,14 +1514,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1453 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); 1514 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1454} 1515}
1455 1516
1456static void add_atomic_switch_msr_special(unsigned long entry, 1517static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1457 unsigned long exit, unsigned long guest_val_vmcs, 1518 unsigned long entry, unsigned long exit,
1458 unsigned long host_val_vmcs, u64 guest_val, u64 host_val) 1519 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1520 u64 guest_val, u64 host_val)
1459{ 1521{
1460 vmcs_write64(guest_val_vmcs, guest_val); 1522 vmcs_write64(guest_val_vmcs, guest_val);
1461 vmcs_write64(host_val_vmcs, host_val); 1523 vmcs_write64(host_val_vmcs, host_val);
1462 vmcs_set_bits(VM_ENTRY_CONTROLS, entry); 1524 vm_entry_controls_setbit(vmx, entry);
1463 vmcs_set_bits(VM_EXIT_CONTROLS, exit); 1525 vm_exit_controls_setbit(vmx, exit);
1464} 1526}
1465 1527
1466static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1528static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
@@ -1472,7 +1534,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1472 switch (msr) { 1534 switch (msr) {
1473 case MSR_EFER: 1535 case MSR_EFER:
1474 if (cpu_has_load_ia32_efer) { 1536 if (cpu_has_load_ia32_efer) {
1475 add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, 1537 add_atomic_switch_msr_special(vmx,
1538 VM_ENTRY_LOAD_IA32_EFER,
1476 VM_EXIT_LOAD_IA32_EFER, 1539 VM_EXIT_LOAD_IA32_EFER,
1477 GUEST_IA32_EFER, 1540 GUEST_IA32_EFER,
1478 HOST_IA32_EFER, 1541 HOST_IA32_EFER,
@@ -1482,7 +1545,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1482 break; 1545 break;
1483 case MSR_CORE_PERF_GLOBAL_CTRL: 1546 case MSR_CORE_PERF_GLOBAL_CTRL:
1484 if (cpu_has_load_perf_global_ctrl) { 1547 if (cpu_has_load_perf_global_ctrl) {
1485 add_atomic_switch_msr_special( 1548 add_atomic_switch_msr_special(vmx,
1486 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1549 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1487 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1550 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1488 GUEST_IA32_PERF_GLOBAL_CTRL, 1551 GUEST_IA32_PERF_GLOBAL_CTRL,
@@ -1906,7 +1969,9 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
1906 if (!(vmcs12->exception_bitmap & (1u << nr))) 1969 if (!(vmcs12->exception_bitmap & (1u << nr)))
1907 return 0; 1970 return 0;
1908 1971
1909 nested_vmx_vmexit(vcpu); 1972 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
1973 vmcs_read32(VM_EXIT_INTR_INFO),
1974 vmcs_readl(EXIT_QUALIFICATION));
1910 return 1; 1975 return 1;
1911} 1976}
1912 1977
@@ -2279,6 +2344,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2279 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2344 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2280 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2345 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
2281 VMX_MISC_SAVE_EFER_LMA; 2346 VMX_MISC_SAVE_EFER_LMA;
2347 nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
2282 nested_vmx_misc_high = 0; 2348 nested_vmx_misc_high = 0;
2283} 2349}
2284 2350
@@ -2295,32 +2361,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
2295 return low | ((u64)high << 32); 2361 return low | ((u64)high << 32);
2296} 2362}
2297 2363
2298/* 2364/* Returns 0 on success, non-0 otherwise. */
2299 * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
2300 * also let it use VMX-specific MSRs.
2301 * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
2302 * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
2303 * like all other MSRs).
2304 */
2305static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2365static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2306{ 2366{
2307 if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
2308 msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
2309 /*
2310 * According to the spec, processors which do not support VMX
2311 * should throw a #GP(0) when VMX capability MSRs are read.
2312 */
2313 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
2314 return 1;
2315 }
2316
2317 switch (msr_index) { 2367 switch (msr_index) {
2318 case MSR_IA32_FEATURE_CONTROL:
2319 if (nested_vmx_allowed(vcpu)) {
2320 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2321 break;
2322 }
2323 return 0;
2324 case MSR_IA32_VMX_BASIC: 2368 case MSR_IA32_VMX_BASIC:
2325 /* 2369 /*
2326 * This MSR reports some information about VMX support. We 2370 * This MSR reports some information about VMX support. We
@@ -2387,34 +2431,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2387 *pdata = nested_vmx_ept_caps; 2431 *pdata = nested_vmx_ept_caps;
2388 break; 2432 break;
2389 default: 2433 default:
2390 return 0;
2391 }
2392
2393 return 1;
2394}
2395
2396static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2397{
2398 u32 msr_index = msr_info->index;
2399 u64 data = msr_info->data;
2400 bool host_initialized = msr_info->host_initiated;
2401
2402 if (!nested_vmx_allowed(vcpu))
2403 return 0;
2404
2405 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2406 if (!host_initialized &&
2407 to_vmx(vcpu)->nested.msr_ia32_feature_control
2408 & FEATURE_CONTROL_LOCKED)
2409 return 0;
2410 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2411 return 1; 2434 return 1;
2412 } 2435 }
2413 2436
2414 /*
2415 * No need to treat VMX capability MSRs specially: If we don't handle
2416 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2417 */
2418 return 0; 2437 return 0;
2419} 2438}
2420 2439
@@ -2460,13 +2479,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2460 case MSR_IA32_SYSENTER_ESP: 2479 case MSR_IA32_SYSENTER_ESP:
2461 data = vmcs_readl(GUEST_SYSENTER_ESP); 2480 data = vmcs_readl(GUEST_SYSENTER_ESP);
2462 break; 2481 break;
2482 case MSR_IA32_FEATURE_CONTROL:
2483 if (!nested_vmx_allowed(vcpu))
2484 return 1;
2485 data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2486 break;
2487 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2488 if (!nested_vmx_allowed(vcpu))
2489 return 1;
2490 return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2463 case MSR_TSC_AUX: 2491 case MSR_TSC_AUX:
2464 if (!to_vmx(vcpu)->rdtscp_enabled) 2492 if (!to_vmx(vcpu)->rdtscp_enabled)
2465 return 1; 2493 return 1;
2466 /* Otherwise falls through */ 2494 /* Otherwise falls through */
2467 default: 2495 default:
2468 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2469 return 0;
2470 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2496 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2471 if (msr) { 2497 if (msr) {
2472 data = msr->data; 2498 data = msr->data;
@@ -2479,6 +2505,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2479 return 0; 2505 return 0;
2480} 2506}
2481 2507
2508static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2509
2482/* 2510/*
2483 * Writes msr value into into the appropriate "register". 2511 * Writes msr value into into the appropriate "register".
2484 * Returns 0 on success, non-0 otherwise. 2512 * Returns 0 on success, non-0 otherwise.
@@ -2533,6 +2561,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2533 case MSR_IA32_TSC_ADJUST: 2561 case MSR_IA32_TSC_ADJUST:
2534 ret = kvm_set_msr_common(vcpu, msr_info); 2562 ret = kvm_set_msr_common(vcpu, msr_info);
2535 break; 2563 break;
2564 case MSR_IA32_FEATURE_CONTROL:
2565 if (!nested_vmx_allowed(vcpu) ||
2566 (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2567 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2568 return 1;
2569 vmx->nested.msr_ia32_feature_control = data;
2570 if (msr_info->host_initiated && data == 0)
2571 vmx_leave_nested(vcpu);
2572 break;
2573 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2574 return 1; /* they are read-only */
2536 case MSR_TSC_AUX: 2575 case MSR_TSC_AUX:
2537 if (!vmx->rdtscp_enabled) 2576 if (!vmx->rdtscp_enabled)
2538 return 1; 2577 return 1;
@@ -2541,8 +2580,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2541 return 1; 2580 return 1;
2542 /* Otherwise falls through */ 2581 /* Otherwise falls through */
2543 default: 2582 default:
2544 if (vmx_set_vmx_msr(vcpu, msr_info))
2545 break;
2546 msr = find_msr_entry(vmx, msr_index); 2583 msr = find_msr_entry(vmx, msr_index);
2547 if (msr) { 2584 if (msr) {
2548 msr->data = data; 2585 msr->data = data;
@@ -3182,14 +3219,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3182 vmx_load_host_state(to_vmx(vcpu)); 3219 vmx_load_host_state(to_vmx(vcpu));
3183 vcpu->arch.efer = efer; 3220 vcpu->arch.efer = efer;
3184 if (efer & EFER_LMA) { 3221 if (efer & EFER_LMA) {
3185 vmcs_write32(VM_ENTRY_CONTROLS, 3222 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3186 vmcs_read32(VM_ENTRY_CONTROLS) |
3187 VM_ENTRY_IA32E_MODE);
3188 msr->data = efer; 3223 msr->data = efer;
3189 } else { 3224 } else {
3190 vmcs_write32(VM_ENTRY_CONTROLS, 3225 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3191 vmcs_read32(VM_ENTRY_CONTROLS) &
3192 ~VM_ENTRY_IA32E_MODE);
3193 3226
3194 msr->data = efer & ~EFER_LME; 3227 msr->data = efer & ~EFER_LME;
3195 } 3228 }
@@ -3217,9 +3250,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
3217 3250
3218static void exit_lmode(struct kvm_vcpu *vcpu) 3251static void exit_lmode(struct kvm_vcpu *vcpu)
3219{ 3252{
3220 vmcs_write32(VM_ENTRY_CONTROLS, 3253 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3221 vmcs_read32(VM_ENTRY_CONTROLS)
3222 & ~VM_ENTRY_IA32E_MODE);
3223 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3254 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3224} 3255}
3225 3256
@@ -4346,10 +4377,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4346 ++vmx->nmsrs; 4377 ++vmx->nmsrs;
4347 } 4378 }
4348 4379
4349 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 4380
4381 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
4350 4382
4351 /* 22.2.1, 20.8.1 */ 4383 /* 22.2.1, 20.8.1 */
4352 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 4384 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
4353 4385
4354 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 4386 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
4355 set_cr4_guest_host_mask(vmx); 4387 set_cr4_guest_host_mask(vmx);
@@ -4588,15 +4620,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4588static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4620static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4589{ 4621{
4590 if (is_guest_mode(vcpu)) { 4622 if (is_guest_mode(vcpu)) {
4591 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4592
4593 if (to_vmx(vcpu)->nested.nested_run_pending) 4623 if (to_vmx(vcpu)->nested.nested_run_pending)
4594 return 0; 4624 return 0;
4595 if (nested_exit_on_nmi(vcpu)) { 4625 if (nested_exit_on_nmi(vcpu)) {
4596 nested_vmx_vmexit(vcpu); 4626 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4597 vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; 4627 NMI_VECTOR | INTR_TYPE_NMI_INTR |
4598 vmcs12->vm_exit_intr_info = NMI_VECTOR | 4628 INTR_INFO_VALID_MASK, 0);
4599 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
4600 /* 4629 /*
4601 * The NMI-triggered VM exit counts as injection: 4630 * The NMI-triggered VM exit counts as injection:
4602 * clear this one and block further NMIs. 4631 * clear this one and block further NMIs.
@@ -4618,15 +4647,11 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4618static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4647static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4619{ 4648{
4620 if (is_guest_mode(vcpu)) { 4649 if (is_guest_mode(vcpu)) {
4621 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4622
4623 if (to_vmx(vcpu)->nested.nested_run_pending) 4650 if (to_vmx(vcpu)->nested.nested_run_pending)
4624 return 0; 4651 return 0;
4625 if (nested_exit_on_intr(vcpu)) { 4652 if (nested_exit_on_intr(vcpu)) {
4626 nested_vmx_vmexit(vcpu); 4653 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4627 vmcs12->vm_exit_reason = 4654 0, 0);
4628 EXIT_REASON_EXTERNAL_INTERRUPT;
4629 vmcs12->vm_exit_intr_info = 0;
4630 /* 4655 /*
4631 * fall through to normal code, but now in L1, not L2 4656 * fall through to normal code, but now in L1, not L2
4632 */ 4657 */
@@ -4812,7 +4837,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4812 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4837 dr6 = vmcs_readl(EXIT_QUALIFICATION);
4813 if (!(vcpu->guest_debug & 4838 if (!(vcpu->guest_debug &
4814 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4839 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4815 vcpu->arch.dr6 = dr6 | DR6_FIXED_1; 4840 vcpu->arch.dr6 &= ~15;
4841 vcpu->arch.dr6 |= dr6;
4816 kvm_queue_exception(vcpu, DB_VECTOR); 4842 kvm_queue_exception(vcpu, DB_VECTOR);
4817 return 1; 4843 return 1;
4818 } 4844 }
@@ -5080,14 +5106,27 @@ static int handle_dr(struct kvm_vcpu *vcpu)
5080 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5106 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5081 if (exit_qualification & TYPE_MOV_FROM_DR) { 5107 if (exit_qualification & TYPE_MOV_FROM_DR) {
5082 unsigned long val; 5108 unsigned long val;
5083 if (!kvm_get_dr(vcpu, dr, &val)) 5109
5084 kvm_register_write(vcpu, reg, val); 5110 if (kvm_get_dr(vcpu, dr, &val))
5111 return 1;
5112 kvm_register_write(vcpu, reg, val);
5085 } else 5113 } else
5086 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); 5114 if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
5115 return 1;
5116
5087 skip_emulated_instruction(vcpu); 5117 skip_emulated_instruction(vcpu);
5088 return 1; 5118 return 1;
5089} 5119}
5090 5120
5121static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5122{
5123 return vcpu->arch.dr6;
5124}
5125
5126static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5127{
5128}
5129
5091static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5130static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5092{ 5131{
5093 vmcs_writel(GUEST_DR7, val); 5132 vmcs_writel(GUEST_DR7, val);
@@ -6460,11 +6499,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6460 int size; 6499 int size;
6461 u8 b; 6500 u8 b;
6462 6501
6463 if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
6464 return 1;
6465
6466 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6502 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6467 return 0; 6503 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6468 6504
6469 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6505 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6470 6506
@@ -6628,6 +6664,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6628 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6664 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6629 u32 exit_reason = vmx->exit_reason; 6665 u32 exit_reason = vmx->exit_reason;
6630 6666
6667 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
6668 vmcs_readl(EXIT_QUALIFICATION),
6669 vmx->idt_vectoring_info,
6670 intr_info,
6671 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6672 KVM_ISA_VMX);
6673
6631 if (vmx->nested.nested_run_pending) 6674 if (vmx->nested.nested_run_pending)
6632 return 0; 6675 return 0;
6633 6676
@@ -6777,7 +6820,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6777 return handle_invalid_guest_state(vcpu); 6820 return handle_invalid_guest_state(vcpu);
6778 6821
6779 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { 6822 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
6780 nested_vmx_vmexit(vcpu); 6823 nested_vmx_vmexit(vcpu, exit_reason,
6824 vmcs_read32(VM_EXIT_INTR_INFO),
6825 vmcs_readl(EXIT_QUALIFICATION));
6781 return 1; 6826 return 1;
6782 } 6827 }
6783 6828
@@ -7332,8 +7377,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7332 struct vcpu_vmx *vmx = to_vmx(vcpu); 7377 struct vcpu_vmx *vmx = to_vmx(vcpu);
7333 7378
7334 free_vpid(vmx); 7379 free_vpid(vmx);
7335 free_nested(vmx);
7336 free_loaded_vmcs(vmx->loaded_vmcs); 7380 free_loaded_vmcs(vmx->loaded_vmcs);
7381 free_nested(vmx);
7337 kfree(vmx->guest_msrs); 7382 kfree(vmx->guest_msrs);
7338 kvm_vcpu_uninit(vcpu); 7383 kvm_vcpu_uninit(vcpu);
7339 kmem_cache_free(kvm_vcpu_cache, vmx); 7384 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7518,15 +7563,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7518static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 7563static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7519 struct x86_exception *fault) 7564 struct x86_exception *fault)
7520{ 7565{
7521 struct vmcs12 *vmcs12; 7566 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7522 nested_vmx_vmexit(vcpu); 7567 u32 exit_reason;
7523 vmcs12 = get_vmcs12(vcpu);
7524 7568
7525 if (fault->error_code & PFERR_RSVD_MASK) 7569 if (fault->error_code & PFERR_RSVD_MASK)
7526 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 7570 exit_reason = EXIT_REASON_EPT_MISCONFIG;
7527 else 7571 else
7528 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 7572 exit_reason = EXIT_REASON_EPT_VIOLATION;
7529 vmcs12->exit_qualification = vcpu->arch.exit_qualification; 7573 nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
7530 vmcs12->guest_physical_address = fault->address; 7574 vmcs12->guest_physical_address = fault->address;
7531} 7575}
7532 7576
@@ -7564,7 +7608,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
7564 7608
7565 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 7609 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
7566 if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) 7610 if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
7567 nested_vmx_vmexit(vcpu); 7611 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
7612 vmcs_read32(VM_EXIT_INTR_INFO),
7613 vmcs_readl(EXIT_QUALIFICATION));
7568 else 7614 else
7569 kvm_inject_page_fault(vcpu, fault); 7615 kvm_inject_page_fault(vcpu, fault);
7570} 7616}
@@ -7706,6 +7752,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7706 else 7752 else
7707 vmcs_write64(APIC_ACCESS_ADDR, 7753 vmcs_write64(APIC_ACCESS_ADDR,
7708 page_to_phys(vmx->nested.apic_access_page)); 7754 page_to_phys(vmx->nested.apic_access_page));
7755 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
7756 exec_control |=
7757 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7758 vmcs_write64(APIC_ACCESS_ADDR,
7759 page_to_phys(vcpu->kvm->arch.apic_access_page));
7709 } 7760 }
7710 7761
7711 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7762 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
@@ -7759,12 +7810,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7759 exit_control = vmcs_config.vmexit_ctrl; 7810 exit_control = vmcs_config.vmexit_ctrl;
7760 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) 7811 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
7761 exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; 7812 exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
7762 vmcs_write32(VM_EXIT_CONTROLS, exit_control); 7813 vm_exit_controls_init(vmx, exit_control);
7763 7814
7764 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 7815 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7765 * emulated by vmx_set_efer(), below. 7816 * emulated by vmx_set_efer(), below.
7766 */ 7817 */
7767 vmcs_write32(VM_ENTRY_CONTROLS, 7818 vm_entry_controls_init(vmx,
7768 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & 7819 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7769 ~VM_ENTRY_IA32E_MODE) | 7820 ~VM_ENTRY_IA32E_MODE) |
7770 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7821 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
@@ -7882,7 +7933,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7882 return 1; 7933 return 1;
7883 } 7934 }
7884 7935
7885 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { 7936 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
7937 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
7886 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 7938 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
7887 return 1; 7939 return 1;
7888 } 7940 }
@@ -7994,8 +8046,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7994 8046
7995 enter_guest_mode(vcpu); 8047 enter_guest_mode(vcpu);
7996 8048
7997 vmx->nested.nested_run_pending = 1;
7998
7999 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 8049 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
8000 8050
8001 cpu = get_cpu(); 8051 cpu = get_cpu();
@@ -8011,6 +8061,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8011 8061
8012 prepare_vmcs02(vcpu, vmcs12); 8062 prepare_vmcs02(vcpu, vmcs12);
8013 8063
8064 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
8065 return kvm_emulate_halt(vcpu);
8066
8067 vmx->nested.nested_run_pending = 1;
8068
8014 /* 8069 /*
8015 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 8070 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
8016 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 8071 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
@@ -8110,7 +8165,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
8110 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 8165 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
8111 * which already writes to vmcs12 directly. 8166 * which already writes to vmcs12 directly.
8112 */ 8167 */
8113static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 8168static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
8169 u32 exit_reason, u32 exit_intr_info,
8170 unsigned long exit_qualification)
8114{ 8171{
8115 /* update guest state fields: */ 8172 /* update guest state fields: */
8116 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 8173 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -8162,6 +8219,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8162 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 8219 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
8163 vmcs12->guest_pending_dbg_exceptions = 8220 vmcs12->guest_pending_dbg_exceptions =
8164 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8221 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
8222 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
8223 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
8224 else
8225 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
8165 8226
8166 if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && 8227 if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
8167 (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) 8228 (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
@@ -8186,7 +8247,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8186 8247
8187 vmcs12->vm_entry_controls = 8248 vmcs12->vm_entry_controls =
8188 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8249 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
8189 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8250 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
8190 8251
8191 /* TODO: These cannot have changed unless we have MSR bitmaps and 8252 /* TODO: These cannot have changed unless we have MSR bitmaps and
8192 * the relevant bit asks not to trap the change */ 8253 * the relevant bit asks not to trap the change */
@@ -8201,10 +8262,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8201 8262
8202 /* update exit information fields: */ 8263 /* update exit information fields: */
8203 8264
8204 vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; 8265 vmcs12->vm_exit_reason = exit_reason;
8205 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 8266 vmcs12->exit_qualification = exit_qualification;
8206 8267
8207 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8268 vmcs12->vm_exit_intr_info = exit_intr_info;
8208 if ((vmcs12->vm_exit_intr_info & 8269 if ((vmcs12->vm_exit_intr_info &
8209 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 8270 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
8210 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) 8271 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
@@ -8370,7 +8431,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8370 * and modify vmcs12 to make it see what it would expect to see there if 8431 * and modify vmcs12 to make it see what it would expect to see there if
8371 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 8432 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
8372 */ 8433 */
8373static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) 8434static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
8435 u32 exit_intr_info,
8436 unsigned long exit_qualification)
8374{ 8437{
8375 struct vcpu_vmx *vmx = to_vmx(vcpu); 8438 struct vcpu_vmx *vmx = to_vmx(vcpu);
8376 int cpu; 8439 int cpu;
@@ -8380,7 +8443,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
8380 WARN_ON_ONCE(vmx->nested.nested_run_pending); 8443 WARN_ON_ONCE(vmx->nested.nested_run_pending);
8381 8444
8382 leave_guest_mode(vcpu); 8445 leave_guest_mode(vcpu);
8383 prepare_vmcs12(vcpu, vmcs12); 8446 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
8447 exit_qualification);
8448
8449 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
8450 vmcs12->exit_qualification,
8451 vmcs12->idt_vectoring_info_field,
8452 vmcs12->vm_exit_intr_info,
8453 vmcs12->vm_exit_intr_error_code,
8454 KVM_ISA_VMX);
8384 8455
8385 cpu = get_cpu(); 8456 cpu = get_cpu();
8386 vmx->loaded_vmcs = &vmx->vmcs01; 8457 vmx->loaded_vmcs = &vmx->vmcs01;
@@ -8389,6 +8460,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
8389 vcpu->cpu = cpu; 8460 vcpu->cpu = cpu;
8390 put_cpu(); 8461 put_cpu();
8391 8462
8463 vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
8464 vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
8392 vmx_segment_cache_clear(vmx); 8465 vmx_segment_cache_clear(vmx);
8393 8466
8394 /* if no vmcs02 cache requested, remove the one we used */ 8467 /* if no vmcs02 cache requested, remove the one we used */
@@ -8424,6 +8497,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
8424} 8497}
8425 8498
8426/* 8499/*
8500 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
8501 */
8502static void vmx_leave_nested(struct kvm_vcpu *vcpu)
8503{
8504 if (is_guest_mode(vcpu))
8505 nested_vmx_vmexit(vcpu, -1, 0, 0);
8506 free_nested(to_vmx(vcpu));
8507}
8508
8509/*
8427 * L1's failure to enter L2 is a subset of a normal exit, as explained in 8510 * L1's failure to enter L2 is a subset of a normal exit, as explained in
8428 * 23.7 "VM-entry failures during or after loading guest state" (this also 8511 * 23.7 "VM-entry failures during or after loading guest state" (this also
8429 * lists the acceptable exit-reason and exit-qualification parameters). 8512 * lists the acceptable exit-reason and exit-qualification parameters).
@@ -8486,6 +8569,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
8486 .set_idt = vmx_set_idt, 8569 .set_idt = vmx_set_idt,
8487 .get_gdt = vmx_get_gdt, 8570 .get_gdt = vmx_get_gdt,
8488 .set_gdt = vmx_set_gdt, 8571 .set_gdt = vmx_set_gdt,
8572 .get_dr6 = vmx_get_dr6,
8573 .set_dr6 = vmx_set_dr6,
8489 .set_dr7 = vmx_set_dr7, 8574 .set_dr7 = vmx_set_dr7,
8490 .cache_reg = vmx_cache_reg, 8575 .cache_reg = vmx_cache_reg,
8491 .get_rflags = vmx_get_rflags, 8576 .get_rflags = vmx_get_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d004da1e35d..0c76f7cfdb32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
94static bool ignore_msrs = 0; 94static bool ignore_msrs = 0;
95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
96 96
97unsigned int min_timer_period_us = 500;
98module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
99
97bool kvm_has_tsc_control; 100bool kvm_has_tsc_control;
98EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 101EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
99u32 kvm_max_guest_tsc_khz; 102u32 kvm_max_guest_tsc_khz;
@@ -719,6 +722,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
719} 722}
720EXPORT_SYMBOL_GPL(kvm_get_cr8); 723EXPORT_SYMBOL_GPL(kvm_get_cr8);
721 724
725static void kvm_update_dr6(struct kvm_vcpu *vcpu)
726{
727 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
728 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
729}
730
722static void kvm_update_dr7(struct kvm_vcpu *vcpu) 731static void kvm_update_dr7(struct kvm_vcpu *vcpu)
723{ 732{
724 unsigned long dr7; 733 unsigned long dr7;
@@ -747,6 +756,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
747 if (val & 0xffffffff00000000ULL) 756 if (val & 0xffffffff00000000ULL)
748 return -1; /* #GP */ 757 return -1; /* #GP */
749 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 758 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
759 kvm_update_dr6(vcpu);
750 break; 760 break;
751 case 5: 761 case 5:
752 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 762 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -788,7 +798,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
788 return 1; 798 return 1;
789 /* fall through */ 799 /* fall through */
790 case 6: 800 case 6:
791 *val = vcpu->arch.dr6; 801 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
802 *val = vcpu->arch.dr6;
803 else
804 *val = kvm_x86_ops->get_dr6(vcpu);
792 break; 805 break;
793 case 5: 806 case 5:
794 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 807 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -836,11 +849,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
836 * kvm-specific. Those are put in the beginning of the list. 849 * kvm-specific. Those are put in the beginning of the list.
837 */ 850 */
838 851
839#define KVM_SAVE_MSRS_BEGIN 10 852#define KVM_SAVE_MSRS_BEGIN 12
840static u32 msrs_to_save[] = { 853static u32 msrs_to_save[] = {
841 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 854 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
842 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 855 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
843 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 856 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
857 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
844 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 858 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
845 MSR_KVM_PV_EOI_EN, 859 MSR_KVM_PV_EOI_EN,
846 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 860 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -1275,8 +1289,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1275 kvm->arch.last_tsc_write = data; 1289 kvm->arch.last_tsc_write = data;
1276 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 1290 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1277 1291
1278 /* Reset of TSC must disable overshoot protection below */
1279 vcpu->arch.hv_clock.tsc_timestamp = 0;
1280 vcpu->arch.last_guest_tsc = data; 1292 vcpu->arch.last_guest_tsc = data;
1281 1293
1282 /* Keep track of which generation this VCPU has synchronized to */ 1294 /* Keep track of which generation this VCPU has synchronized to */
@@ -1484,7 +1496,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1484 unsigned long flags, this_tsc_khz; 1496 unsigned long flags, this_tsc_khz;
1485 struct kvm_vcpu_arch *vcpu = &v->arch; 1497 struct kvm_vcpu_arch *vcpu = &v->arch;
1486 struct kvm_arch *ka = &v->kvm->arch; 1498 struct kvm_arch *ka = &v->kvm->arch;
1487 s64 kernel_ns, max_kernel_ns; 1499 s64 kernel_ns;
1488 u64 tsc_timestamp, host_tsc; 1500 u64 tsc_timestamp, host_tsc;
1489 struct pvclock_vcpu_time_info guest_hv_clock; 1501 struct pvclock_vcpu_time_info guest_hv_clock;
1490 u8 pvclock_flags; 1502 u8 pvclock_flags;
@@ -1543,37 +1555,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1543 if (!vcpu->pv_time_enabled) 1555 if (!vcpu->pv_time_enabled)
1544 return 0; 1556 return 0;
1545 1557
1546 /*
1547 * Time as measured by the TSC may go backwards when resetting the base
1548 * tsc_timestamp. The reason for this is that the TSC resolution is
1549 * higher than the resolution of the other clock scales. Thus, many
1550 * possible measurments of the TSC correspond to one measurement of any
1551 * other clock, and so a spread of values is possible. This is not a
1552 * problem for the computation of the nanosecond clock; with TSC rates
1553 * around 1GHZ, there can only be a few cycles which correspond to one
1554 * nanosecond value, and any path through this code will inevitably
1555 * take longer than that. However, with the kernel_ns value itself,
1556 * the precision may be much lower, down to HZ granularity. If the
1557 * first sampling of TSC against kernel_ns ends in the low part of the
1558 * range, and the second in the high end of the range, we can get:
1559 *
1560 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1561 *
1562 * As the sampling errors potentially range in the thousands of cycles,
1563 * it is possible such a time value has already been observed by the
1564 * guest. To protect against this, we must compute the system time as
1565 * observed by the guest and ensure the new system time is greater.
1566 */
1567 max_kernel_ns = 0;
1568 if (vcpu->hv_clock.tsc_timestamp) {
1569 max_kernel_ns = vcpu->last_guest_tsc -
1570 vcpu->hv_clock.tsc_timestamp;
1571 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1572 vcpu->hv_clock.tsc_to_system_mul,
1573 vcpu->hv_clock.tsc_shift);
1574 max_kernel_ns += vcpu->last_kernel_ns;
1575 }
1576
1577 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1558 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1578 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1559 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1579 &vcpu->hv_clock.tsc_shift, 1560 &vcpu->hv_clock.tsc_shift,
@@ -1581,14 +1562,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1581 vcpu->hw_tsc_khz = this_tsc_khz; 1562 vcpu->hw_tsc_khz = this_tsc_khz;
1582 } 1563 }
1583 1564
1584 /* with a master <monotonic time, tsc value> tuple,
1585 * pvclock clock reads always increase at the (scaled) rate
1586 * of guest TSC - no need to deal with sampling errors.
1587 */
1588 if (!use_master_clock) {
1589 if (max_kernel_ns > kernel_ns)
1590 kernel_ns = max_kernel_ns;
1591 }
1592 /* With all the info we got, fill in the values */ 1565 /* With all the info we got, fill in the values */
1593 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1566 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1594 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1567 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1826,6 +1799,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
1826 switch (msr) { 1799 switch (msr) {
1827 case HV_X64_MSR_GUEST_OS_ID: 1800 case HV_X64_MSR_GUEST_OS_ID:
1828 case HV_X64_MSR_HYPERCALL: 1801 case HV_X64_MSR_HYPERCALL:
1802 case HV_X64_MSR_REFERENCE_TSC:
1803 case HV_X64_MSR_TIME_REF_COUNT:
1829 r = true; 1804 r = true;
1830 break; 1805 break;
1831 } 1806 }
@@ -1867,6 +1842,20 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1867 kvm->arch.hv_hypercall = data; 1842 kvm->arch.hv_hypercall = data;
1868 break; 1843 break;
1869 } 1844 }
1845 case HV_X64_MSR_REFERENCE_TSC: {
1846 u64 gfn;
1847 HV_REFERENCE_TSC_PAGE tsc_ref;
1848 memset(&tsc_ref, 0, sizeof(tsc_ref));
1849 kvm->arch.hv_tsc_page = data;
1850 if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
1851 break;
1852 gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
1853 if (kvm_write_guest(kvm, data,
1854 &tsc_ref, sizeof(tsc_ref)))
1855 return 1;
1856 mark_page_dirty(kvm, gfn);
1857 break;
1858 }
1870 default: 1859 default:
1871 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1860 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1872 "data 0x%llx\n", msr, data); 1861 "data 0x%llx\n", msr, data);
@@ -2291,6 +2280,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2291 case HV_X64_MSR_HYPERCALL: 2280 case HV_X64_MSR_HYPERCALL:
2292 data = kvm->arch.hv_hypercall; 2281 data = kvm->arch.hv_hypercall;
2293 break; 2282 break;
2283 case HV_X64_MSR_TIME_REF_COUNT: {
2284 data =
2285 div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
2286 break;
2287 }
2288 case HV_X64_MSR_REFERENCE_TSC:
2289 data = kvm->arch.hv_tsc_page;
2290 break;
2294 default: 2291 default:
2295 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2292 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2296 return 1; 2293 return 1;
@@ -2604,6 +2601,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2604#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2601#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2605 case KVM_CAP_ASSIGN_DEV_IRQ: 2602 case KVM_CAP_ASSIGN_DEV_IRQ:
2606 case KVM_CAP_PCI_2_3: 2603 case KVM_CAP_PCI_2_3:
2604 case KVM_CAP_HYPERV_TIME:
2607#endif 2605#endif
2608 r = 1; 2606 r = 1;
2609 break; 2607 break;
@@ -2972,8 +2970,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2972static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2970static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2973 struct kvm_debugregs *dbgregs) 2971 struct kvm_debugregs *dbgregs)
2974{ 2972{
2973 unsigned long val;
2974
2975 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2975 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2976 dbgregs->dr6 = vcpu->arch.dr6; 2976 _kvm_get_dr(vcpu, 6, &val);
2977 dbgregs->dr6 = val;
2977 dbgregs->dr7 = vcpu->arch.dr7; 2978 dbgregs->dr7 = vcpu->arch.dr7;
2978 dbgregs->flags = 0; 2979 dbgregs->flags = 0;
2979 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 2980 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
@@ -2987,7 +2988,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2987 2988
2988 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2989 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2989 vcpu->arch.dr6 = dbgregs->dr6; 2990 vcpu->arch.dr6 = dbgregs->dr6;
2991 kvm_update_dr6(vcpu);
2990 vcpu->arch.dr7 = dbgregs->dr7; 2992 vcpu->arch.dr7 = dbgregs->dr7;
2993 kvm_update_dr7(vcpu);
2991 2994
2992 return 0; 2995 return 0;
2993} 2996}
@@ -5834,6 +5837,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5834 kvm_apic_update_tmr(vcpu, tmr); 5837 kvm_apic_update_tmr(vcpu, tmr);
5835} 5838}
5836 5839
5840/*
5841 * Returns 1 to let __vcpu_run() continue the guest execution loop without
5842 * exiting to userspace. Otherwise, the value will be returned to
5843 * userspace.
5844 */
5837static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5845static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5838{ 5846{
5839 int r; 5847 int r;
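
The comment above documents the contract between vcpu_enter_guest() and its caller: a return of 1 keeps the guest-execution loop going, any other value is handed back to userspace. A stand-alone sketch of that contract (the helper below is a stand-in, not the real function, and the loop shape is illustrative rather than the actual __vcpu_run() body):

    #include <stdio.h>

    /* Stand-in for vcpu_enter_guest(): keeps returning 1 for a few rounds,
     * then returns 0 as if the guest needed to exit to userspace. */
    static int fake_enter_guest(int *rounds)
    {
        return (++*rounds < 3) ? 1 : 0;
    }

    int main(void)
    {
        int rounds = 0;
        int r = 1;

        while (r == 1)                  /* 1 means "continue the loop" */
            r = fake_enter_guest(&rounds);

        printf("returning r = %d to userspace after %d rounds\n", r, rounds);
        return 0;
    }
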
@@ -6089,7 +6097,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6089 } 6097 }
6090 if (need_resched()) { 6098 if (need_resched()) {
6091 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6099 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6092 kvm_resched(vcpu); 6100 cond_resched();
6093 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6101 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6094 } 6102 }
6095 } 6103 }
@@ -6717,6 +6725,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6717 6725
6718 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6726 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6719 vcpu->arch.dr6 = DR6_FIXED_1; 6727 vcpu->arch.dr6 = DR6_FIXED_1;
6728 kvm_update_dr6(vcpu);
6720 vcpu->arch.dr7 = DR7_FIXED_1; 6729 vcpu->arch.dr7 = DR7_FIXED_1;
6721 kvm_update_dr7(vcpu); 6730 kvm_update_dr7(vcpu);
6722 6731
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 587fb9ede436..8da5823bcde6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -125,5 +125,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) 125#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
126extern u64 host_xcr0; 126extern u64 host_xcr0;
127 127
128extern unsigned int min_timer_period_us;
129
128extern struct static_key kvm_no_apic_vcpu; 130extern struct static_key kvm_no_apic_vcpu;
129#endif 131#endif
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a30ca15be21c..dee945d55594 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -186,7 +186,7 @@ ENTRY(copy_user_generic_unrolled)
18630: shll $6,%ecx 18630: shll $6,%ecx
187 addl %ecx,%edx 187 addl %ecx,%edx
188 jmp 60f 188 jmp 60f
18940: lea (%rdx,%rcx,8),%rdx 18940: leal (%rdx,%rcx,8),%edx
190 jmp 60f 190 jmp 60f
19150: movl %ecx,%edx 19150: movl %ecx,%edx
19260: jmp copy_user_handle_tail /* ecx is zerorest also */ 19260: jmp copy_user_handle_tail /* ecx is zerorest also */
@@ -236,8 +236,6 @@ ENDPROC(copy_user_generic_unrolled)
236ENTRY(copy_user_generic_string) 236ENTRY(copy_user_generic_string)
237 CFI_STARTPROC 237 CFI_STARTPROC
238 ASM_STAC 238 ASM_STAC
239 andl %edx,%edx
240 jz 4f
241 cmpl $8,%edx 239 cmpl $8,%edx
242 jb 2f /* less than 8 bytes, go to byte copy loop */ 240 jb 2f /* less than 8 bytes, go to byte copy loop */
243 ALIGN_DESTINATION 241 ALIGN_DESTINATION
@@ -249,12 +247,12 @@ ENTRY(copy_user_generic_string)
2492: movl %edx,%ecx 2472: movl %edx,%ecx
2503: rep 2483: rep
251 movsb 249 movsb
2524: xorl %eax,%eax 250 xorl %eax,%eax
253 ASM_CLAC 251 ASM_CLAC
254 ret 252 ret
255 253
256 .section .fixup,"ax" 254 .section .fixup,"ax"
25711: lea (%rdx,%rcx,8),%rcx 25511: leal (%rdx,%rcx,8),%ecx
25812: movl %ecx,%edx /* ecx is zerorest also */ 25612: movl %ecx,%edx /* ecx is zerorest also */
259 jmp copy_user_handle_tail 257 jmp copy_user_handle_tail
260 .previous 258 .previous
@@ -279,12 +277,10 @@ ENDPROC(copy_user_generic_string)
279ENTRY(copy_user_enhanced_fast_string) 277ENTRY(copy_user_enhanced_fast_string)
280 CFI_STARTPROC 278 CFI_STARTPROC
281 ASM_STAC 279 ASM_STAC
282 andl %edx,%edx
283 jz 2f
284 movl %edx,%ecx 280 movl %edx,%ecx
2851: rep 2811: rep
286 movsb 282 movsb
2872: xorl %eax,%eax 283 xorl %eax,%eax
288 ASM_CLAC 284 ASM_CLAC
289 ret 285 ret
290 286
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 7c3bee636e2f..39d6a3db0b96 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -16,7 +16,6 @@
16#include <linux/timex.h> 16#include <linux/timex.h>
17#include <linux/preempt.h> 17#include <linux/preempt.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/init.h>
20 19
21#include <asm/processor.h> 20#include <asm/processor.h>
22#include <asm/delay.h> 21#include <asm/delay.h>
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 533a85e3a07e..1a2be7c6895d 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -346,8 +346,8 @@ AVXcode: 1
34617: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 34617: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
34718: Grp16 (1A) 34718: Grp16 (1A)
34819: 34819:
3491a: 3491a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
3501b: 3501b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
3511c: 3511c:
3521d: 3521d:
3531e: 3531e:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ff85bb8dd69..9d591c895803 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -641,6 +641,20 @@ no_context(struct pt_regs *regs, unsigned long error_code,
641 641
642 /* Are we prepared to handle this kernel fault? */ 642 /* Are we prepared to handle this kernel fault? */
643 if (fixup_exception(regs)) { 643 if (fixup_exception(regs)) {
644 /*
645 * Any interrupt that takes a fault gets the fixup. This makes
 646 * the below recursive fault logic only apply to faults from
647 * task context.
648 */
649 if (in_interrupt())
650 return;
651
652 /*
653 * Per the above we're !in_interrupt(), aka. task context.
654 *
655 * In this case we need to make sure we're not recursively
656 * faulting through the emulate_vsyscall() logic.
657 */
644 if (current_thread_info()->sig_on_uaccess_error && signal) { 658 if (current_thread_info()->sig_on_uaccess_error && signal) {
645 tsk->thread.trap_nr = X86_TRAP_PF; 659 tsk->thread.trap_nr = X86_TRAP_PF;
646 tsk->thread.error_code = error_code | PF_USER; 660 tsk->thread.error_code = error_code | PF_USER;
@@ -649,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
649 /* XXX: hwpoison faults will set the wrong code. */ 663 /* XXX: hwpoison faults will set the wrong code. */
650 force_sig_info_fault(signal, si_code, address, tsk, 0); 664 force_sig_info_fault(signal, si_code, address, tsk, 0);
651 } 665 }
666
667 /*
668 * Barring that, we can do the fixup and be happy.
669 */
652 return; 670 return;
653 } 671 }
654 672
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 0596e8e0cc19..207d9aef662d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
108 108
109static inline void get_head_page_multiple(struct page *page, int nr) 109static inline void get_head_page_multiple(struct page *page, int nr)
110{ 110{
111 VM_BUG_ON(page != compound_head(page)); 111 VM_BUG_ON_PAGE(page != compound_head(page), page);
112 VM_BUG_ON(page_count(page) == 0); 112 VM_BUG_ON_PAGE(page_count(page) == 0, page);
113 atomic_add(nr, &page->_count); 113 atomic_add(nr, &page->_count);
114 SetPageReferenced(page); 114 SetPageReferenced(page);
115} 115}
@@ -135,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
135 head = pte_page(pte); 135 head = pte_page(pte);
136 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 136 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
137 do { 137 do {
138 VM_BUG_ON(compound_head(page) != head); 138 VM_BUG_ON_PAGE(compound_head(page) != head, page);
139 pages[*nr] = page; 139 pages[*nr] = page;
140 if (PageTail(page)) 140 if (PageTail(page))
141 get_huge_page_tail(page); 141 get_huge_page_tail(page);
@@ -212,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
212 head = pte_page(pte); 212 head = pte_page(pte);
213 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 213 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
214 do { 214 do {
215 VM_BUG_ON(compound_head(page) != head); 215 VM_BUG_ON_PAGE(compound_head(page) != head, page);
216 pages[*nr] = page; 216 pages[*nr] = page;
217 if (PageTail(page)) 217 if (PageTail(page))
218 get_huge_page_tail(page); 218 get_huge_page_tail(page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 9d980d88b747..8c9f647ff9e1 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -87,9 +87,7 @@ int pmd_huge_support(void)
87} 87}
88#endif 88#endif
89 89
90/* x86_64 also uses this file */ 90#ifdef CONFIG_HUGETLB_PAGE
91
92#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
93static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, 91static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
94 unsigned long addr, unsigned long len, 92 unsigned long addr, unsigned long len,
95 unsigned long pgoff, unsigned long flags) 93 unsigned long pgoff, unsigned long flags)
@@ -99,7 +97,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
99 97
100 info.flags = 0; 98 info.flags = 0;
101 info.length = len; 99 info.length = len;
102 info.low_limit = TASK_UNMAPPED_BASE; 100 info.low_limit = current->mm->mmap_legacy_base;
103 info.high_limit = TASK_SIZE; 101 info.high_limit = TASK_SIZE;
104 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 102 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
105 info.align_offset = 0; 103 info.align_offset = 0;
@@ -172,8 +170,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
172 return hugetlb_get_unmapped_area_topdown(file, addr, len, 170 return hugetlb_get_unmapped_area_topdown(file, addr, len,
173 pgoff, flags); 171 pgoff, flags);
174} 172}
175 173#endif /* CONFIG_HUGETLB_PAGE */
176#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
177 174
178#ifdef CONFIG_X86_64 175#ifdef CONFIG_X86_64
179static __init int setup_hugepagesz(char *opt) 176static __init int setup_hugepagesz(char *opt)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4287f1ffba7e..e39504878aec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
666#endif 666#endif
667 667
668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
669 sparse_memory_present_with_active_regions(0); 669 sparse_memory_present_with_active_regions(0);
670 670
671#ifdef CONFIG_FLATMEM 671#ifdef CONFIG_FLATMEM
@@ -806,6 +806,9 @@ void __init mem_init(void)
806 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); 806 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
807#undef high_memory 807#undef high_memory
808#undef __FIXADDR_TOP 808#undef __FIXADDR_TOP
809#ifdef CONFIG_RANDOMIZE_BASE
810 BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
811#endif
809 812
810#ifdef CONFIG_HIGHMEM 813#ifdef CONFIG_HIGHMEM
811 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 814 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245f..f35c66c5959a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
643#ifndef CONFIG_NUMA 643#ifndef CONFIG_NUMA
644void __init initmem_init(void) 644void __init initmem_init(void)
645{ 645{
646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
647} 647}
648#endif 648#endif
649 649
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2ce9f77..637ab34ed632 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
11#include <linux/rculist.h> 11#include <linux/rculist.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hash.h> 13#include <linux/hash.h>
14#include <linux/init.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/kernel.h> 15#include <linux/kernel.h>
17#include <linux/uaccess.h> 16#include <linux/uaccess.h>
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409ee..1e9da795767a 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end); 78 this_start = clamp_t(phys_addr_t, this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end); 79 this_end = clamp_t(phys_addr_t, this_end, start, end);
80 if (this_start < this_end) { 80 if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58d6afd..81b2750f3666 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
211 */ 211 */
212 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); 212 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
213 if (!nd_pa) { 213 if (!nd_pa) {
214 pr_err("Cannot find %zu bytes in node %d\n", 214 nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
215 nd_size, nid); 215 MEMBLOCK_ALLOC_ACCESSIBLE);
216 return; 216 if (!nd_pa) {
217 pr_err("Cannot find %zu bytes in node %d\n",
218 nd_size, nid);
219 return;
220 }
217 } 221 }
218 nd = __va(nd_pa); 222 nd = __va(nd_pa);
219 223
@@ -487,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
487 491
488 for (i = 0; i < mi->nr_blks; i++) { 492 for (i = 0; i < mi->nr_blks; i++) {
489 struct numa_memblk *mb = &mi->blk[i]; 493 struct numa_memblk *mb = &mi->blk[i];
490 memblock_set_node(mb->start, mb->end - mb->start, mb->nid); 494 memblock_set_node(mb->start, mb->end - mb->start,
495 &memblock.memory, mb->nid);
496
497 /*
498 * At this time, all memory regions reserved by memblock are
 499 * used by the kernel. Setting the nid in memblock.reserved will
500 * mark out all the nodes the kernel resides in.
501 */
502 memblock_set_node(mb->start, mb->end - mb->start,
503 &memblock.reserved, mb->nid);
491 } 504 }
492 505
493 /* 506 /*
@@ -549,6 +562,30 @@ static void __init numa_init_array(void)
549 } 562 }
550} 563}
551 564
565static void __init numa_clear_kernel_node_hotplug(void)
566{
567 int i, nid;
568 nodemask_t numa_kernel_nodes;
569 unsigned long start, end;
570 struct memblock_type *type = &memblock.reserved;
571
572 /* Mark all kernel nodes. */
573 for (i = 0; i < type->cnt; i++)
574 node_set(type->regions[i].nid, numa_kernel_nodes);
575
576 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
577 for (i = 0; i < numa_meminfo.nr_blks; i++) {
578 nid = numa_meminfo.blk[i].nid;
579 if (!node_isset(nid, numa_kernel_nodes))
580 continue;
581
582 start = numa_meminfo.blk[i].start;
583 end = numa_meminfo.blk[i].end;
584
585 memblock_clear_hotplug(start, end - start);
586 }
587}
588
552static int __init numa_init(int (*init_func)(void)) 589static int __init numa_init(int (*init_func)(void))
553{ 590{
554 int i; 591 int i;
@@ -561,7 +598,12 @@ static int __init numa_init(int (*init_func)(void))
561 nodes_clear(node_possible_map); 598 nodes_clear(node_possible_map);
562 nodes_clear(node_online_map); 599 nodes_clear(node_online_map);
563 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 600 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
564 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 601 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
602 MAX_NUMNODES));
603 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
604 MAX_NUMNODES));
605 /* In case that parsing SRAT failed. */
606 WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
565 numa_reset_distance(); 607 numa_reset_distance();
566 608
567 ret = init_func(); 609 ret = init_func();
@@ -597,6 +639,16 @@ static int __init numa_init(int (*init_func)(void))
597 numa_clear_node(i); 639 numa_clear_node(i);
598 } 640 }
599 numa_init_array(); 641 numa_init_array();
642
643 /*
 644 * At very early boot, the kernel has to use some memory, such as for
645 * loading the kernel image. We cannot prevent this anyway. So any
646 * node the kernel resides in should be un-hotpluggable.
647 *
648 * And when we come here, numa_init() won't fail.
649 */
650 numa_clear_kernel_node_hotplug();
651
600 return 0; 652 return 0;
601} 653}
602 654
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d0b1773d9d2e..461bc8289024 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/mm.h> 11#include <linux/mm.h>
13 12
14#include <asm/cacheflush.h> 13#include <asm/cacheflush.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480c2d71..b3b19f46c016 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -30,6 +30,7 @@
30 */ 30 */
31struct cpa_data { 31struct cpa_data {
32 unsigned long *vaddr; 32 unsigned long *vaddr;
33 pgd_t *pgd;
33 pgprot_t mask_set; 34 pgprot_t mask_set;
34 pgprot_t mask_clr; 35 pgprot_t mask_clr;
35 int numpages; 36 int numpages;
@@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
322 return prot; 323 return prot;
323} 324}
324 325
325/* 326static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
326 * Lookup the page table entry for a virtual address. Return a pointer 327 unsigned int *level)
327 * to the entry and the level of the mapping.
328 *
329 * Note: We return pud and pmd either when the entry is marked large
330 * or when the present bit is not set. Otherwise we would return a
331 * pointer to a nonexisting mapping.
332 */
333pte_t *lookup_address(unsigned long address, unsigned int *level)
334{ 328{
335 pgd_t *pgd = pgd_offset_k(address);
336 pud_t *pud; 329 pud_t *pud;
337 pmd_t *pmd; 330 pmd_t *pmd;
338 331
@@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
361 354
362 return pte_offset_kernel(pmd, address); 355 return pte_offset_kernel(pmd, address);
363} 356}
357
358/*
359 * Lookup the page table entry for a virtual address. Return a pointer
360 * to the entry and the level of the mapping.
361 *
362 * Note: We return pud and pmd either when the entry is marked large
363 * or when the present bit is not set. Otherwise we would return a
364 * pointer to a nonexisting mapping.
365 */
366pte_t *lookup_address(unsigned long address, unsigned int *level)
367{
368 return __lookup_address_in_pgd(pgd_offset_k(address), address, level);
369}
364EXPORT_SYMBOL_GPL(lookup_address); 370EXPORT_SYMBOL_GPL(lookup_address);
365 371
372static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
373 unsigned int *level)
374{
375 if (cpa->pgd)
376 return __lookup_address_in_pgd(cpa->pgd + pgd_index(address),
377 address, level);
378
379 return lookup_address(address, level);
380}
381
366/* 382/*
367 * This is necessary because __pa() does not work on some 383 * This is necessary because __pa() does not work on some
368 * kinds of memory, like vmalloc() or the alloc_remap() 384 * kinds of memory, like vmalloc() or the alloc_remap()
@@ -437,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
437 * Check for races, another CPU might have split this page 453 * Check for races, another CPU might have split this page
438 * up already: 454 * up already:
439 */ 455 */
440 tmp = lookup_address(address, &level); 456 tmp = _lookup_address_cpa(cpa, address, &level);
441 if (tmp != kpte) 457 if (tmp != kpte)
442 goto out_unlock; 458 goto out_unlock;
443 459
@@ -543,7 +559,8 @@ out_unlock:
543} 559}
544 560
545static int 561static int
546__split_large_page(pte_t *kpte, unsigned long address, struct page *base) 562__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
563 struct page *base)
547{ 564{
548 pte_t *pbase = (pte_t *)page_address(base); 565 pte_t *pbase = (pte_t *)page_address(base);
549 unsigned long pfn, pfninc = 1; 566 unsigned long pfn, pfninc = 1;
@@ -556,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
556 * Check for races, another CPU might have split this page 573 * Check for races, another CPU might have split this page
557 * up for us already: 574 * up for us already:
558 */ 575 */
559 tmp = lookup_address(address, &level); 576 tmp = _lookup_address_cpa(cpa, address, &level);
560 if (tmp != kpte) { 577 if (tmp != kpte) {
561 spin_unlock(&pgd_lock); 578 spin_unlock(&pgd_lock);
562 return 1; 579 return 1;
@@ -632,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
632 return 0; 649 return 0;
633} 650}
634 651
635static int split_large_page(pte_t *kpte, unsigned long address) 652static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
653 unsigned long address)
636{ 654{
637 struct page *base; 655 struct page *base;
638 656
@@ -644,15 +662,390 @@ static int split_large_page(pte_t *kpte, unsigned long address)
644 if (!base) 662 if (!base)
645 return -ENOMEM; 663 return -ENOMEM;
646 664
647 if (__split_large_page(kpte, address, base)) 665 if (__split_large_page(cpa, kpte, address, base))
648 __free_page(base); 666 __free_page(base);
649 667
650 return 0; 668 return 0;
651} 669}
652 670
671static bool try_to_free_pte_page(pte_t *pte)
672{
673 int i;
674
675 for (i = 0; i < PTRS_PER_PTE; i++)
676 if (!pte_none(pte[i]))
677 return false;
678
679 free_page((unsigned long)pte);
680 return true;
681}
682
683static bool try_to_free_pmd_page(pmd_t *pmd)
684{
685 int i;
686
687 for (i = 0; i < PTRS_PER_PMD; i++)
688 if (!pmd_none(pmd[i]))
689 return false;
690
691 free_page((unsigned long)pmd);
692 return true;
693}
694
695static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
696{
697 pte_t *pte = pte_offset_kernel(pmd, start);
698
699 while (start < end) {
700 set_pte(pte, __pte(0));
701
702 start += PAGE_SIZE;
703 pte++;
704 }
705
706 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
707 pmd_clear(pmd);
708 return true;
709 }
710 return false;
711}
712
713static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
714 unsigned long start, unsigned long end)
715{
716 if (unmap_pte_range(pmd, start, end))
717 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
718 pud_clear(pud);
719}
720
721static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
722{
723 pmd_t *pmd = pmd_offset(pud, start);
724
725 /*
726 * Not on a 2MB page boundary?
727 */
728 if (start & (PMD_SIZE - 1)) {
729 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
730 unsigned long pre_end = min_t(unsigned long, end, next_page);
731
732 __unmap_pmd_range(pud, pmd, start, pre_end);
733
734 start = pre_end;
735 pmd++;
736 }
737
738 /*
739 * Try to unmap in 2M chunks.
740 */
741 while (end - start >= PMD_SIZE) {
742 if (pmd_large(*pmd))
743 pmd_clear(pmd);
744 else
745 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
746
747 start += PMD_SIZE;
748 pmd++;
749 }
750
751 /*
752 * 4K leftovers?
753 */
754 if (start < end)
755 return __unmap_pmd_range(pud, pmd, start, end);
756
757 /*
 758 * Try again to free the PMD page if we haven't succeeded above.
759 */
760 if (!pud_none(*pud))
761 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
762 pud_clear(pud);
763}
764
765static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
766{
767 pud_t *pud = pud_offset(pgd, start);
768
769 /*
770 * Not on a GB page boundary?
771 */
772 if (start & (PUD_SIZE - 1)) {
773 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
774 unsigned long pre_end = min_t(unsigned long, end, next_page);
775
776 unmap_pmd_range(pud, start, pre_end);
777
778 start = pre_end;
779 pud++;
780 }
781
782 /*
783 * Try to unmap in 1G chunks?
784 */
785 while (end - start >= PUD_SIZE) {
786
787 if (pud_large(*pud))
788 pud_clear(pud);
789 else
790 unmap_pmd_range(pud, start, start + PUD_SIZE);
791
792 start += PUD_SIZE;
793 pud++;
794 }
795
796 /*
797 * 2M leftovers?
798 */
799 if (start < end)
800 unmap_pmd_range(pud, start, end);
801
802 /*
803 * No need to try to free the PUD page because we'll free it in
804 * populate_pgd's error path
805 */
806}
807
808static int alloc_pte_page(pmd_t *pmd)
809{
810 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
811 if (!pte)
812 return -1;
813
814 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
815 return 0;
816}
817
818static int alloc_pmd_page(pud_t *pud)
819{
820 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
821 if (!pmd)
822 return -1;
823
824 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
825 return 0;
826}
827
828static void populate_pte(struct cpa_data *cpa,
829 unsigned long start, unsigned long end,
830 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
831{
832 pte_t *pte;
833
834 pte = pte_offset_kernel(pmd, start);
835
836 while (num_pages-- && start < end) {
837
838 /* deal with the NX bit */
839 if (!(pgprot_val(pgprot) & _PAGE_NX))
840 cpa->pfn &= ~_PAGE_NX;
841
842 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
843
844 start += PAGE_SIZE;
845 cpa->pfn += PAGE_SIZE;
846 pte++;
847 }
848}
849
850static int populate_pmd(struct cpa_data *cpa,
851 unsigned long start, unsigned long end,
852 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
853{
854 unsigned int cur_pages = 0;
855 pmd_t *pmd;
856
857 /*
858 * Not on a 2M boundary?
859 */
860 if (start & (PMD_SIZE - 1)) {
861 unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
862 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
863
864 pre_end = min_t(unsigned long, pre_end, next_page);
865 cur_pages = (pre_end - start) >> PAGE_SHIFT;
866 cur_pages = min_t(unsigned int, num_pages, cur_pages);
867
868 /*
869 * Need a PTE page?
870 */
871 pmd = pmd_offset(pud, start);
872 if (pmd_none(*pmd))
873 if (alloc_pte_page(pmd))
874 return -1;
875
876 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
877
878 start = pre_end;
879 }
880
881 /*
882 * We mapped them all?
883 */
884 if (num_pages == cur_pages)
885 return cur_pages;
886
887 while (end - start >= PMD_SIZE) {
888
889 /*
890 * We cannot use a 1G page so allocate a PMD page if needed.
891 */
892 if (pud_none(*pud))
893 if (alloc_pmd_page(pud))
894 return -1;
895
896 pmd = pmd_offset(pud, start);
897
898 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
899
900 start += PMD_SIZE;
901 cpa->pfn += PMD_SIZE;
902 cur_pages += PMD_SIZE >> PAGE_SHIFT;
903 }
904
905 /*
906 * Map trailing 4K pages.
907 */
908 if (start < end) {
909 pmd = pmd_offset(pud, start);
910 if (pmd_none(*pmd))
911 if (alloc_pte_page(pmd))
912 return -1;
913
914 populate_pte(cpa, start, end, num_pages - cur_pages,
915 pmd, pgprot);
916 }
917 return num_pages;
918}
919
920static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
921 pgprot_t pgprot)
922{
923 pud_t *pud;
924 unsigned long end;
925 int cur_pages = 0;
926
927 end = start + (cpa->numpages << PAGE_SHIFT);
928
929 /*
930 * Not on a Gb page boundary? => map everything up to it with
931 * smaller pages.
932 */
933 if (start & (PUD_SIZE - 1)) {
934 unsigned long pre_end;
935 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
936
937 pre_end = min_t(unsigned long, end, next_page);
938 cur_pages = (pre_end - start) >> PAGE_SHIFT;
939 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
940
941 pud = pud_offset(pgd, start);
942
943 /*
944 * Need a PMD page?
945 */
946 if (pud_none(*pud))
947 if (alloc_pmd_page(pud))
948 return -1;
949
950 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
951 pud, pgprot);
952 if (cur_pages < 0)
953 return cur_pages;
954
955 start = pre_end;
956 }
957
958 /* We mapped them all? */
959 if (cpa->numpages == cur_pages)
960 return cur_pages;
961
962 pud = pud_offset(pgd, start);
963
964 /*
965 * Map everything starting from the Gb boundary, possibly with 1G pages
966 */
967 while (end - start >= PUD_SIZE) {
968 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
969
970 start += PUD_SIZE;
971 cpa->pfn += PUD_SIZE;
972 cur_pages += PUD_SIZE >> PAGE_SHIFT;
973 pud++;
974 }
975
976 /* Map trailing leftover */
977 if (start < end) {
978 int tmp;
979
980 pud = pud_offset(pgd, start);
981 if (pud_none(*pud))
982 if (alloc_pmd_page(pud))
983 return -1;
984
985 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
986 pud, pgprot);
987 if (tmp < 0)
988 return cur_pages;
989
990 cur_pages += tmp;
991 }
992 return cur_pages;
993}
994
995/*
 996 * Restrictions for the kernel page table do not necessarily apply when mapping in
997 * an alternate PGD.
998 */
999static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1000{
1001 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1002 bool allocd_pgd = false;
1003 pgd_t *pgd_entry;
1004 pud_t *pud = NULL; /* shut up gcc */
1005 int ret;
1006
1007 pgd_entry = cpa->pgd + pgd_index(addr);
1008
1009 /*
1010 * Allocate a PUD page and hand it down for mapping.
1011 */
1012 if (pgd_none(*pgd_entry)) {
1013 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1014 if (!pud)
1015 return -1;
1016
1017 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1018 allocd_pgd = true;
1019 }
1020
1021 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1022 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1023
1024 ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1025 if (ret < 0) {
1026 unmap_pud_range(pgd_entry, addr,
1027 addr + (cpa->numpages << PAGE_SHIFT));
1028
1029 if (allocd_pgd) {
1030 /*
1031 * If I allocated this PUD page, I can just as well
1032 * free it in this error path.
1033 */
1034 pgd_clear(pgd_entry);
1035 free_page((unsigned long)pud);
1036 }
1037 return ret;
1038 }
1039 cpa->numpages = ret;
1040 return 0;
1041}
1042
653static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1043static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
654 int primary) 1044 int primary)
655{ 1045{
1046 if (cpa->pgd)
1047 return populate_pgd(cpa, vaddr);
1048
656 /* 1049 /*
657 * Ignore all non primary paths. 1050 * Ignore all non primary paths.
658 */ 1051 */
@@ -697,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
697 else 1090 else
698 address = *cpa->vaddr; 1091 address = *cpa->vaddr;
699repeat: 1092repeat:
700 kpte = lookup_address(address, &level); 1093 kpte = _lookup_address_cpa(cpa, address, &level);
701 if (!kpte) 1094 if (!kpte)
702 return __cpa_process_fault(cpa, address, primary); 1095 return __cpa_process_fault(cpa, address, primary);
703 1096
@@ -761,7 +1154,7 @@ repeat:
761 /* 1154 /*
762 * We have to split the large page: 1155 * We have to split the large page:
763 */ 1156 */
764 err = split_large_page(kpte, address); 1157 err = split_large_page(cpa, kpte, address);
765 if (!err) { 1158 if (!err) {
766 /* 1159 /*
767 * Do a global flush tlb after splitting the large page 1160 * Do a global flush tlb after splitting the large page
@@ -910,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
910 int ret, cache, checkalias; 1303 int ret, cache, checkalias;
911 unsigned long baddr = 0; 1304 unsigned long baddr = 0;
912 1305
1306 memset(&cpa, 0, sizeof(cpa));
1307
913 /* 1308 /*
914 * Check, if we are requested to change a not supported 1309 * Check, if we are requested to change a not supported
915 * feature: 1310 * feature:
@@ -1356,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages)
1356{ 1751{
1357 unsigned long tempaddr = (unsigned long) page_address(page); 1752 unsigned long tempaddr = (unsigned long) page_address(page);
1358 struct cpa_data cpa = { .vaddr = &tempaddr, 1753 struct cpa_data cpa = { .vaddr = &tempaddr,
1754 .pgd = NULL,
1359 .numpages = numpages, 1755 .numpages = numpages,
1360 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1756 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1361 .mask_clr = __pgprot(0), 1757 .mask_clr = __pgprot(0),
@@ -1374,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages)
1374{ 1770{
1375 unsigned long tempaddr = (unsigned long) page_address(page); 1771 unsigned long tempaddr = (unsigned long) page_address(page);
1376 struct cpa_data cpa = { .vaddr = &tempaddr, 1772 struct cpa_data cpa = { .vaddr = &tempaddr,
1773 .pgd = NULL,
1377 .numpages = numpages, 1774 .numpages = numpages,
1378 .mask_set = __pgprot(0), 1775 .mask_set = __pgprot(0),
1379 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1776 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1434,6 +1831,36 @@ bool kernel_page_present(struct page *page)
1434 1831
1435#endif /* CONFIG_DEBUG_PAGEALLOC */ 1832#endif /* CONFIG_DEBUG_PAGEALLOC */
1436 1833
1834int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1835 unsigned numpages, unsigned long page_flags)
1836{
1837 int retval = -EINVAL;
1838
1839 struct cpa_data cpa = {
1840 .vaddr = &address,
1841 .pfn = pfn,
1842 .pgd = pgd,
1843 .numpages = numpages,
1844 .mask_set = __pgprot(0),
1845 .mask_clr = __pgprot(0),
1846 .flags = 0,
1847 };
1848
1849 if (!(__supported_pte_mask & _PAGE_NX))
1850 goto out;
1851
1852 if (!(page_flags & _PAGE_NX))
1853 cpa.mask_clr = __pgprot(_PAGE_NX);
1854
1855 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1856
1857 retval = __change_page_attr_set_clr(&cpa, 0);
1858 __flush_tlb_all();
1859
1860out:
1861 return retval;
1862}
1863
1437/* 1864/*
1438 * The testcases use internal knowledge of the implementation that shouldn't 1865 * The testcases use internal knowledge of the implementation that shouldn't
1439 * be exposed to the rest of the kernel. Include these directly here. 1866 * be exposed to the rest of the kernel. Include these directly here.
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62e..1a25187e151e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -181,6 +181,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
181 (unsigned long long) start, (unsigned long long) end - 1, 181 (unsigned long long) start, (unsigned long long) end - 1,
182 hotpluggable ? " hotplug" : ""); 182 hotpluggable ? " hotplug" : "");
183 183
184 /* Mark hotplug range in memblock. */
185 if (hotpluggable && memblock_mark_hotplug(start, ma->length))
186 pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
187 (unsigned long long)start, (unsigned long long)end - 1);
188
184 return 0; 189 return 0;
185out_err_bad_srat: 190out_err_bad_srat:
186 bad_srat(); 191 bad_srat();
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26328e800869..4ed75dd81d05 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -359,15 +359,21 @@ void bpf_jit_compile(struct sk_filter *fp)
359 EMIT2(0x89, 0xd0); /* mov %edx,%eax */ 359 EMIT2(0x89, 0xd0); /* mov %edx,%eax */
360 break; 360 break;
361 case BPF_S_ALU_MOD_K: /* A %= K; */ 361 case BPF_S_ALU_MOD_K: /* A %= K; */
362 if (K == 1) {
363 CLEAR_A();
364 break;
365 }
362 EMIT2(0x31, 0xd2); /* xor %edx,%edx */ 366 EMIT2(0x31, 0xd2); /* xor %edx,%edx */
363 EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ 367 EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
364 EMIT2(0xf7, 0xf1); /* div %ecx */ 368 EMIT2(0xf7, 0xf1); /* div %ecx */
365 EMIT2(0x89, 0xd0); /* mov %edx,%eax */ 369 EMIT2(0x89, 0xd0); /* mov %edx,%eax */
366 break; 370 break;
367 case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ 371 case BPF_S_ALU_DIV_K: /* A /= K */
368 EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ 372 if (K == 1)
369 EMIT(K, 4); 373 break;
370 EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ 374 EMIT2(0x31, 0xd2); /* xor %edx,%edx */
375 EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
376 EMIT2(0xf7, 0xf1); /* div %ecx */
371 break; 377 break;
372 case BPF_S_ALU_AND_X: 378 case BPF_S_ALU_AND_X:
373 seen |= SEEN_XREG; 379 seen |= SEEN_XREG;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b046e070e088..bca9e85daaa5 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -5,7 +5,6 @@
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/pci.h> 7#include <linux/pci.h>
8#include <linux/init.h>
9#include <linux/vgaarb.h> 8#include <linux/vgaarb.h>
10#include <asm/pci_x86.h> 9#include <asm/pci_x86.h>
11 10
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 51384ca727ad..84b9d672843d 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -31,6 +31,7 @@
31#include <asm/pci_x86.h> 31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h> 32#include <asm/hw_irq.h>
33#include <asm/io_apic.h> 33#include <asm/io_apic.h>
34#include <asm/intel-mid.h>
34 35
35#define PCIE_CAP_OFFSET 0x100 36#define PCIE_CAP_OFFSET 0x100
36 37
@@ -219,7 +220,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
219 irq_attr.ioapic = mp_find_ioapic(dev->irq); 220 irq_attr.ioapic = mp_find_ioapic(dev->irq);
220 irq_attr.ioapic_pin = dev->irq; 221 irq_attr.ioapic_pin = dev->irq;
221 irq_attr.trigger = 1; /* level */ 222 irq_attr.trigger = 1; /* level */
222 irq_attr.polarity = 1; /* active low */ 223 if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
224 irq_attr.polarity = 0; /* active high */
225 else
226 irq_attr.polarity = 1; /* active low */
223 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); 227 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
224 228
225 return 0; 229 return 0;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 5eee4959785d..103e702ec5a7 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -337,7 +337,7 @@ out:
337 return ret; 337 return ret;
338} 338}
339 339
340static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq) 340static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
341{ 341{
342 int ret = 0; 342 int ret = 0;
343 343
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cceb813044ef..d62ec87a2b26 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -12,6 +12,8 @@
12 * Bibo Mao <bibo.mao@intel.com> 12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com> 13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com> 14 * Huang Ying <ying.huang@intel.com>
15 * Copyright (C) 2013 SuSE Labs
16 * Borislav Petkov <bp@suse.de> - runtime services VA mapping
15 * 17 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI 18 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26 19 * 32/64 support code. --ying 2007-10-26
@@ -51,7 +53,7 @@
51#include <asm/x86_init.h> 53#include <asm/x86_init.h>
52#include <asm/rtc.h> 54#include <asm/rtc.h>
53 55
54#define EFI_DEBUG 1 56#define EFI_DEBUG
55 57
56#define EFI_MIN_RESERVE 5120 58#define EFI_MIN_RESERVE 5120
57 59
@@ -74,6 +76,8 @@ static __initdata efi_config_table_type_t arch_tables[] = {
74 {NULL_GUID, NULL, NULL}, 76 {NULL_GUID, NULL, NULL},
75}; 77};
76 78
79u64 efi_setup; /* efi setup_data physical address */
80
77/* 81/*
78 * Returns 1 if 'facility' is enabled, 0 otherwise. 82 * Returns 1 if 'facility' is enabled, 0 otherwise.
79 */ 83 */
@@ -110,7 +114,6 @@ static int __init setup_storage_paranoia(char *arg)
110} 114}
111early_param("efi_no_storage_paranoia", setup_storage_paranoia); 115early_param("efi_no_storage_paranoia", setup_storage_paranoia);
112 116
113
114static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) 117static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
115{ 118{
116 unsigned long flags; 119 unsigned long flags;
@@ -398,9 +401,9 @@ int __init efi_memblock_x86_reserve_range(void)
398 return 0; 401 return 0;
399} 402}
400 403
401#if EFI_DEBUG
402static void __init print_efi_memmap(void) 404static void __init print_efi_memmap(void)
403{ 405{
406#ifdef EFI_DEBUG
404 efi_memory_desc_t *md; 407 efi_memory_desc_t *md;
405 void *p; 408 void *p;
406 int i; 409 int i;
@@ -415,8 +418,8 @@ static void __init print_efi_memmap(void)
415 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), 418 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
416 (md->num_pages >> (20 - EFI_PAGE_SHIFT))); 419 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
417 } 420 }
418}
419#endif /* EFI_DEBUG */ 421#endif /* EFI_DEBUG */
422}
420 423
421void __init efi_reserve_boot_services(void) 424void __init efi_reserve_boot_services(void)
422{ 425{
@@ -436,7 +439,7 @@ void __init efi_reserve_boot_services(void)
436 * - Not within any part of the kernel 439 * - Not within any part of the kernel
437 * - Not the bios reserved area 440 * - Not the bios reserved area
438 */ 441 */
439 if ((start+size >= __pa_symbol(_text) 442 if ((start + size > __pa_symbol(_text)
440 && start <= __pa_symbol(_end)) || 443 && start <= __pa_symbol(_end)) ||
441 !e820_all_mapped(start, start+size, E820_RAM) || 444 !e820_all_mapped(start, start+size, E820_RAM) ||
442 memblock_is_region_reserved(start, size)) { 445 memblock_is_region_reserved(start, size)) {
@@ -489,18 +492,27 @@ static int __init efi_systab_init(void *phys)
489{ 492{
490 if (efi_enabled(EFI_64BIT)) { 493 if (efi_enabled(EFI_64BIT)) {
491 efi_system_table_64_t *systab64; 494 efi_system_table_64_t *systab64;
495 struct efi_setup_data *data = NULL;
492 u64 tmp = 0; 496 u64 tmp = 0;
493 497
498 if (efi_setup) {
499 data = early_memremap(efi_setup, sizeof(*data));
500 if (!data)
501 return -ENOMEM;
502 }
494 systab64 = early_ioremap((unsigned long)phys, 503 systab64 = early_ioremap((unsigned long)phys,
495 sizeof(*systab64)); 504 sizeof(*systab64));
496 if (systab64 == NULL) { 505 if (systab64 == NULL) {
497 pr_err("Couldn't map the system table!\n"); 506 pr_err("Couldn't map the system table!\n");
507 if (data)
508 early_iounmap(data, sizeof(*data));
498 return -ENOMEM; 509 return -ENOMEM;
499 } 510 }
500 511
501 efi_systab.hdr = systab64->hdr; 512 efi_systab.hdr = systab64->hdr;
502 efi_systab.fw_vendor = systab64->fw_vendor; 513 efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor :
503 tmp |= systab64->fw_vendor; 514 systab64->fw_vendor;
515 tmp |= data ? data->fw_vendor : systab64->fw_vendor;
504 efi_systab.fw_revision = systab64->fw_revision; 516 efi_systab.fw_revision = systab64->fw_revision;
505 efi_systab.con_in_handle = systab64->con_in_handle; 517 efi_systab.con_in_handle = systab64->con_in_handle;
506 tmp |= systab64->con_in_handle; 518 tmp |= systab64->con_in_handle;
@@ -514,15 +526,20 @@ static int __init efi_systab_init(void *phys)
514 tmp |= systab64->stderr_handle; 526 tmp |= systab64->stderr_handle;
515 efi_systab.stderr = systab64->stderr; 527 efi_systab.stderr = systab64->stderr;
516 tmp |= systab64->stderr; 528 tmp |= systab64->stderr;
517 efi_systab.runtime = (void *)(unsigned long)systab64->runtime; 529 efi_systab.runtime = data ?
518 tmp |= systab64->runtime; 530 (void *)(unsigned long)data->runtime :
531 (void *)(unsigned long)systab64->runtime;
532 tmp |= data ? data->runtime : systab64->runtime;
519 efi_systab.boottime = (void *)(unsigned long)systab64->boottime; 533 efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
520 tmp |= systab64->boottime; 534 tmp |= systab64->boottime;
521 efi_systab.nr_tables = systab64->nr_tables; 535 efi_systab.nr_tables = systab64->nr_tables;
522 efi_systab.tables = systab64->tables; 536 efi_systab.tables = data ? (unsigned long)data->tables :
523 tmp |= systab64->tables; 537 systab64->tables;
538 tmp |= data ? data->tables : systab64->tables;
524 539
525 early_iounmap(systab64, sizeof(*systab64)); 540 early_iounmap(systab64, sizeof(*systab64));
541 if (data)
542 early_iounmap(data, sizeof(*data));
526#ifdef CONFIG_X86_32 543#ifdef CONFIG_X86_32
527 if (tmp >> 32) { 544 if (tmp >> 32) {
528 pr_err("EFI data located above 4GB, disabling EFI.\n"); 545 pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -626,6 +643,62 @@ static int __init efi_memmap_init(void)
626 return 0; 643 return 0;
627} 644}
628 645
646/*
647 * A number of config table entries get remapped to virtual addresses
648 * after entering EFI virtual mode. However, the kexec kernel requires
 649 * their physical addresses, therefore we pass them via setup_data and
650 * correct those entries to their respective physical addresses here.
651 *
 652 * Currently this only handles SMBIOS, which is necessary for some firmware
 653 * implementations.
654 */
655static int __init efi_reuse_config(u64 tables, int nr_tables)
656{
657 int i, sz, ret = 0;
658 void *p, *tablep;
659 struct efi_setup_data *data;
660
661 if (!efi_setup)
662 return 0;
663
664 if (!efi_enabled(EFI_64BIT))
665 return 0;
666
667 data = early_memremap(efi_setup, sizeof(*data));
668 if (!data) {
669 ret = -ENOMEM;
670 goto out;
671 }
672
673 if (!data->smbios)
674 goto out_memremap;
675
676 sz = sizeof(efi_config_table_64_t);
677
678 p = tablep = early_memremap(tables, nr_tables * sz);
679 if (!p) {
680 pr_err("Could not map Configuration table!\n");
681 ret = -ENOMEM;
682 goto out_memremap;
683 }
684
685 for (i = 0; i < efi.systab->nr_tables; i++) {
686 efi_guid_t guid;
687
688 guid = ((efi_config_table_64_t *)p)->guid;
689
690 if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
691 ((efi_config_table_64_t *)p)->table = data->smbios;
692 p += sz;
693 }
694 early_iounmap(tablep, nr_tables * sz);
695
696out_memremap:
697 early_iounmap(data, sizeof(*data));
698out:
699 return ret;
700}
701
629void __init efi_init(void) 702void __init efi_init(void)
630{ 703{
631 efi_char16_t *c16; 704 efi_char16_t *c16;
@@ -651,6 +724,10 @@ void __init efi_init(void)
651 724
652 set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); 725 set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
653 726
727 efi.config_table = (unsigned long)efi.systab->tables;
728 efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
729 efi.runtime = (unsigned long)efi.systab->runtime;
730
654 /* 731 /*
655 * Show what we know for posterity 732 * Show what we know for posterity
656 */ 733 */
@@ -667,6 +744,9 @@ void __init efi_init(void)
667 efi.systab->hdr.revision >> 16, 744 efi.systab->hdr.revision >> 16,
668 efi.systab->hdr.revision & 0xffff, vendor); 745 efi.systab->hdr.revision & 0xffff, vendor);
669 746
747 if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables))
748 return;
749
670 if (efi_config_init(arch_tables)) 750 if (efi_config_init(arch_tables))
671 return; 751 return;
672 752
@@ -684,15 +764,12 @@ void __init efi_init(void)
684 return; 764 return;
685 set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility); 765 set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility);
686 } 766 }
687
688 if (efi_memmap_init()) 767 if (efi_memmap_init())
689 return; 768 return;
690 769
691 set_bit(EFI_MEMMAP, &x86_efi_facility); 770 set_bit(EFI_MEMMAP, &x86_efi_facility);
692 771
693#if EFI_DEBUG
694 print_efi_memmap(); 772 print_efi_memmap();
695#endif
696} 773}
697 774
698void __init efi_late_init(void) 775void __init efi_late_init(void)
@@ -741,36 +818,38 @@ void efi_memory_uc(u64 addr, unsigned long size)
741 set_memory_uc(addr, npages); 818 set_memory_uc(addr, npages);
742} 819}
743 820
744/* 821void __init old_map_region(efi_memory_desc_t *md)
745 * This function will switch the EFI runtime services to virtual mode.
746 * Essentially, look through the EFI memmap and map every region that
747 * has the runtime attribute bit set in its memory descriptor and update
748 * that memory descriptor with the virtual address obtained from ioremap().
749 * This enables the runtime services to be called without having to
750 * thunk back into physical mode for every invocation.
751 */
752void __init efi_enter_virtual_mode(void)
753{ 822{
754 efi_memory_desc_t *md, *prev_md = NULL; 823 u64 start_pfn, end_pfn, end;
755 efi_status_t status;
756 unsigned long size; 824 unsigned long size;
757 u64 end, systab, start_pfn, end_pfn; 825 void *va;
758 void *p, *va, *new_memmap = NULL;
759 int count = 0;
760 826
761 efi.systab = NULL; 827 start_pfn = PFN_DOWN(md->phys_addr);
828 size = md->num_pages << PAGE_SHIFT;
829 end = md->phys_addr + size;
830 end_pfn = PFN_UP(end);
762 831
763 /* 832 if (pfn_range_is_mapped(start_pfn, end_pfn)) {
764 * We don't do virtual mode, since we don't do runtime services, on 833 va = __va(md->phys_addr);
765 * non-native EFI
766 */
767 834
768 if (!efi_is_native()) { 835 if (!(md->attribute & EFI_MEMORY_WB))
769 efi_unmap_memmap(); 836 efi_memory_uc((u64)(unsigned long)va, size);
770 return; 837 } else
771 } 838 va = efi_ioremap(md->phys_addr, size,
839 md->type, md->attribute);
840
841 md->virt_addr = (u64) (unsigned long) va;
842 if (!va)
843 pr_err("ioremap of 0x%llX failed!\n",
844 (unsigned long long)md->phys_addr);
845}
846
847/* Merge contiguous regions of the same type and attribute */
848static void __init efi_merge_regions(void)
849{
850 void *p;
851 efi_memory_desc_t *md, *prev_md = NULL;
772 852
773 /* Merge contiguous regions of the same type and attribute */
774 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 853 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
775 u64 prev_size; 854 u64 prev_size;
776 md = p; 855 md = p;
@@ -796,6 +875,77 @@ void __init efi_enter_virtual_mode(void)
796 } 875 }
797 prev_md = md; 876 prev_md = md;
798 } 877 }
878}
879
880static void __init get_systab_virt_addr(efi_memory_desc_t *md)
881{
882 unsigned long size;
883 u64 end, systab;
884
885 size = md->num_pages << EFI_PAGE_SHIFT;
886 end = md->phys_addr + size;
887 systab = (u64)(unsigned long)efi_phys.systab;
888 if (md->phys_addr <= systab && systab < end) {
889 systab += md->virt_addr - md->phys_addr;
890 efi.systab = (efi_system_table_t *)(unsigned long)systab;
891 }
892}
893
894static int __init save_runtime_map(void)
895{
896 efi_memory_desc_t *md;
897 void *tmp, *p, *q = NULL;
898 int count = 0;
899
900 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
901 md = p;
902
903 if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
904 (md->type == EFI_BOOT_SERVICES_CODE) ||
905 (md->type == EFI_BOOT_SERVICES_DATA))
906 continue;
907 tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL);
908 if (!tmp)
909 goto out;
910 q = tmp;
911
912 memcpy(q + count * memmap.desc_size, md, memmap.desc_size);
913 count++;
914 }
915
916 efi_runtime_map_setup(q, count, memmap.desc_size);
917
918 return 0;
919out:
920 kfree(q);
921 return -ENOMEM;
922}
923
924/*
925 * Map efi regions which were passed via setup_data. The virt_addr is a fixed
 926 * addr which was used in the first kernel of a kexec boot.
927 */
928static void __init efi_map_regions_fixed(void)
929{
930 void *p;
931 efi_memory_desc_t *md;
932
933 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
934 md = p;
935 efi_map_region_fixed(md); /* FIXME: add error handling */
936 get_systab_virt_addr(md);
937 }
938
939}
940
941/*
 942 * Map efi memory ranges for runtime services and update new_memmap with virtual
943 * addresses.
944 */
945static void * __init efi_map_regions(int *count)
946{
947 efi_memory_desc_t *md;
948 void *p, *tmp, *new_memmap = NULL;
799 949
800 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 950 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
801 md = p; 951 md = p;
@@ -807,53 +957,95 @@ void __init efi_enter_virtual_mode(void)
807 continue; 957 continue;
808 } 958 }
809 959
810 size = md->num_pages << EFI_PAGE_SHIFT; 960 efi_map_region(md);
811 end = md->phys_addr + size; 961 get_systab_virt_addr(md);
812 962
813 start_pfn = PFN_DOWN(md->phys_addr); 963 tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
814 end_pfn = PFN_UP(end); 964 GFP_KERNEL);
815 if (pfn_range_is_mapped(start_pfn, end_pfn)) { 965 if (!tmp)
816 va = __va(md->phys_addr); 966 goto out;
967 new_memmap = tmp;
968 memcpy(new_memmap + (*count * memmap.desc_size), md,
969 memmap.desc_size);
970 (*count)++;
971 }
817 972
818 if (!(md->attribute & EFI_MEMORY_WB)) 973 return new_memmap;
819 efi_memory_uc((u64)(unsigned long)va, size); 974out:
820 } else 975 kfree(new_memmap);
821 va = efi_ioremap(md->phys_addr, size, 976 return NULL;
822 md->type, md->attribute); 977}
978
979/*
980 * This function will switch the EFI runtime services to virtual mode.
981 * Essentially, we look through the EFI memmap and map every region that
982 * has the runtime attribute bit set in its memory descriptor into the
983 * ->trampoline_pgd page table using a top-down VA allocation scheme.
984 *
985 * The old method which used to update that memory descriptor with the
986 * virtual address obtained from ioremap() is still supported when the
 987 * kernel is booted with efi=old_map on its command line. That same old
 988 * method enables the runtime services to be called without having to
989 * thunk back into physical mode for every invocation.
990 *
991 * The new method does a pagetable switch in a preemption-safe manner
992 * so that we're in a different address space when calling a runtime
 993 * function. For passing function arguments we copy the PGDs of the
994 * kernel page table into ->trampoline_pgd prior to each call.
995 *
 996 * Specifically for kexec boot, efi runtime maps in the previous kernel should
997 * be passed in via setup_data. In that case runtime ranges will be mapped
998 * to the same virtual addresses as the first kernel.
999 */
1000void __init efi_enter_virtual_mode(void)
1001{
1002 efi_status_t status;
1003 void *new_memmap = NULL;
1004 int err, count = 0;
823 1005
824 md->virt_addr = (u64) (unsigned long) va; 1006 efi.systab = NULL;
825 1007
826 if (!va) { 1008 /*
827 pr_err("ioremap of 0x%llX failed!\n", 1009 * We don't do virtual mode, since we don't do runtime services, on
828 (unsigned long long)md->phys_addr); 1010 * non-native EFI
829 continue; 1011 */
830 } 1012 if (!efi_is_native()) {
1013 efi_unmap_memmap();
1014 return;
1015 }
831 1016
832 systab = (u64) (unsigned long) efi_phys.systab; 1017 if (efi_setup) {
833 if (md->phys_addr <= systab && systab < end) { 1018 efi_map_regions_fixed();
834 systab += md->virt_addr - md->phys_addr; 1019 } else {
835 efi.systab = (efi_system_table_t *) (unsigned long) systab; 1020 efi_merge_regions();
1021 new_memmap = efi_map_regions(&count);
1022 if (!new_memmap) {
1023 pr_err("Error reallocating memory, EFI runtime non-functional!\n");
1024 return;
836 } 1025 }
837 new_memmap = krealloc(new_memmap,
838 (count + 1) * memmap.desc_size,
839 GFP_KERNEL);
840 memcpy(new_memmap + (count * memmap.desc_size), md,
841 memmap.desc_size);
842 count++;
843 } 1026 }
844 1027
1028 err = save_runtime_map();
1029 if (err)
1030 pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
1031
845 BUG_ON(!efi.systab); 1032 BUG_ON(!efi.systab);
846 1033
847 status = phys_efi_set_virtual_address_map( 1034 efi_setup_page_tables();
848 memmap.desc_size * count, 1035 efi_sync_low_kernel_mappings();
849 memmap.desc_size,
850 memmap.desc_version,
851 (efi_memory_desc_t *)__pa(new_memmap));
852 1036
853 if (status != EFI_SUCCESS) { 1037 if (!efi_setup) {
854 pr_alert("Unable to switch EFI into virtual mode " 1038 status = phys_efi_set_virtual_address_map(
855 "(status=%lx)!\n", status); 1039 memmap.desc_size * count,
856 panic("EFI call to SetVirtualAddressMap() failed!"); 1040 memmap.desc_size,
1041 memmap.desc_version,
1042 (efi_memory_desc_t *)__pa(new_memmap));
1043
1044 if (status != EFI_SUCCESS) {
1045 pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
1046 status);
1047 panic("EFI call to SetVirtualAddressMap() failed!");
1048 }
857 } 1049 }
858 1050
859 /* 1051 /*
@@ -876,7 +1068,8 @@ void __init efi_enter_virtual_mode(void)
876 efi.query_variable_info = virt_efi_query_variable_info; 1068 efi.query_variable_info = virt_efi_query_variable_info;
877 efi.update_capsule = virt_efi_update_capsule; 1069 efi.update_capsule = virt_efi_update_capsule;
878 efi.query_capsule_caps = virt_efi_query_capsule_caps; 1070 efi.query_capsule_caps = virt_efi_query_capsule_caps;
879 if (__supported_pte_mask & _PAGE_NX) 1071
1072 if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
880 runtime_code_page_mkexec(); 1073 runtime_code_page_mkexec();
881 1074
882 kfree(new_memmap); 1075 kfree(new_memmap);
@@ -1006,3 +1199,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
1006 return EFI_SUCCESS; 1199 return EFI_SUCCESS;
1007} 1200}
1008EXPORT_SYMBOL_GPL(efi_query_variable_store); 1201EXPORT_SYMBOL_GPL(efi_query_variable_store);
1202
1203static int __init parse_efi_cmdline(char *str)
1204{
1205 if (*str == '=')
1206 str++;
1207
1208 if (!strncmp(str, "old_map", 7))
1209 set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
1210
1211 return 0;
1212}
1213early_param("efi", parse_efi_cmdline);
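
As a usage note for the early_param() handler added above: booting with efi=old_map sets EFI_OLD_MEMMAP, so efi_enter_virtual_mode() keeps the ioremap()-based mapping path instead of the new trampoline_pgd scheme. A hypothetical boot entry, shown only to illustrate the parameter (paths and options are not from the patch):

	linux /boot/vmlinuz root=/dev/sda1 ro efi=old_map
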
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e446941dd7..249b183cf417 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -37,9 +37,19 @@
37 * claim EFI runtime service handler exclusively and to duplicate a memory in 37 * claim EFI runtime service handler exclusively and to duplicate a memory in
38 * low memory space say 0 - 3G. 38 * low memory space say 0 - 3G.
39 */ 39 */
40
41static unsigned long efi_rt_eflags; 40static unsigned long efi_rt_eflags;
42 41
42void efi_sync_low_kernel_mappings(void) {}
43void efi_setup_page_tables(void) {}
44
45void __init efi_map_region(efi_memory_desc_t *md)
46{
47 old_map_region(md);
48}
49
50void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
51void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
52
43void efi_call_phys_prelog(void) 53void efi_call_phys_prelog(void)
44{ 54{
45 struct desc_ptr gdt_descr; 55 struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39a0e7f1f0a3..6284f158a47d 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -38,10 +38,28 @@
38#include <asm/efi.h> 38#include <asm/efi.h>
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/fixmap.h> 40#include <asm/fixmap.h>
41#include <asm/realmode.h>
41 42
42static pgd_t *save_pgd __initdata; 43static pgd_t *save_pgd __initdata;
43static unsigned long efi_flags __initdata; 44static unsigned long efi_flags __initdata;
44 45
46/*
47 * We allocate runtime services regions bottom-up, starting from -4G, i.e.
48 * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
49 */
50static u64 efi_va = -4 * (1UL << 30);
51#define EFI_VA_END (-68 * (1UL << 30))
52
53/*
54 * Scratch space used for switching the pagetable in the EFI stub
55 */
56struct efi_scratch {
57 u64 r15;
58 u64 prev_cr3;
59 pgd_t *efi_pgt;
60 bool use_pgd;
61};
62
45static void __init early_code_mapping_set_exec(int executable) 63static void __init early_code_mapping_set_exec(int executable)
46{ 64{
47 efi_memory_desc_t *md; 65 efi_memory_desc_t *md;
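The two definitions above pin the runtime-services window: efi_va starts 4 GiB below the top of the address space and is decremented towards EFI_VA_END at -68 GiB, which is exactly the 64 GiB budget mentioned in the comment. A quick userspace check of that arithmetic (not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t efi_va_start = -4ULL * (1ULL << 30);	/* 0xffffffff00000000 */
	uint64_t efi_va_end = -68ULL * (1ULL << 30);	/* 0xffffffef00000000 */

	assert(efi_va_start - efi_va_end == 64ULL << 30);
	printf("EFI VA window: %#llx .. %#llx (%llu GiB)\n",
	       (unsigned long long)efi_va_end,
	       (unsigned long long)efi_va_start,
	       (unsigned long long)((efi_va_start - efi_va_end) >> 30));
	return 0;
}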
@@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void)
65 int pgd; 83 int pgd;
66 int n_pgds; 84 int n_pgds;
67 85
86 if (!efi_enabled(EFI_OLD_MEMMAP))
87 return;
88
68 early_code_mapping_set_exec(1); 89 early_code_mapping_set_exec(1);
69 local_irq_save(efi_flags); 90 local_irq_save(efi_flags);
70 91
@@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void)
86 */ 107 */
87 int pgd; 108 int pgd;
88 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); 109 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
110
111 if (!efi_enabled(EFI_OLD_MEMMAP))
112 return;
113
89 for (pgd = 0; pgd < n_pgds; pgd++) 114 for (pgd = 0; pgd < n_pgds; pgd++)
90 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); 115 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
91 kfree(save_pgd); 116 kfree(save_pgd);
@@ -94,6 +119,96 @@ void __init efi_call_phys_epilog(void)
94 early_code_mapping_set_exec(0); 119 early_code_mapping_set_exec(0);
95} 120}
96 121
122/*
123 * Add low kernel mappings for passing arguments to EFI functions.
124 */
125void efi_sync_low_kernel_mappings(void)
126{
127 unsigned num_pgds;
128 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
129
130 if (efi_enabled(EFI_OLD_MEMMAP))
131 return;
132
133 num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
134
135 memcpy(pgd + pgd_index(PAGE_OFFSET),
136 init_mm.pgd + pgd_index(PAGE_OFFSET),
137 sizeof(pgd_t) * num_pgds);
138}
139
140void efi_setup_page_tables(void)
141{
142 efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
143
144 if (!efi_enabled(EFI_OLD_MEMMAP))
145 efi_scratch.use_pgd = true;
146}
147
148static void __init __map_region(efi_memory_desc_t *md, u64 va)
149{
150 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
151 unsigned long pf = 0;
152
153 if (!(md->attribute & EFI_MEMORY_WB))
154 pf |= _PAGE_PCD;
155
156 if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
157 pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
158 md->phys_addr, va);
159}
160
161void __init efi_map_region(efi_memory_desc_t *md)
162{
163 unsigned long size = md->num_pages << PAGE_SHIFT;
164 u64 pa = md->phys_addr;
165
166 if (efi_enabled(EFI_OLD_MEMMAP))
167 return old_map_region(md);
168
169 /*
170 * Make sure the 1:1 mappings are present as a catch-all for b0rked
171 * firmware which doesn't update all internal pointers after switching
172 * to virtual mode and would otherwise crap on us.
173 */
174 __map_region(md, md->phys_addr);
175
176 efi_va -= size;
177
178 /* Is PA 2M-aligned? */
179 if (!(pa & (PMD_SIZE - 1))) {
180 efi_va &= PMD_MASK;
181 } else {
182 u64 pa_offset = pa & (PMD_SIZE - 1);
183 u64 prev_va = efi_va;
184
185 /* get us the same offset within this 2M page */
186 efi_va = (efi_va & PMD_MASK) + pa_offset;
187
188 if (efi_va > prev_va)
189 efi_va -= PMD_SIZE;
190 }
191
192 if (efi_va < EFI_VA_END) {
193 pr_warn(FW_WARN "VA address range overflow!\n");
194 return;
195 }
196
197 /* Do the VA map */
198 __map_region(md, efi_va);
199 md->virt_addr = efi_va;
200}
201
202/*
203 * kexec kernel will use efi_map_region_fixed to map efi runtime memory ranges.
204 * md->virt_addr is the original virtual address which had been mapped in kexec
205 * 1st kernel.
206 */
207void __init efi_map_region_fixed(efi_memory_desc_t *md)
208{
209 __map_region(md, md->virt_addr);
210}
211
97void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, 212void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
98 u32 type, u64 attribute) 213 u32 type, u64 attribute)
99{ 214{
@@ -113,3 +228,8 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
113 228
114 return (void __iomem *)__va(phys_addr); 229 return (void __iomem *)__va(phys_addr);
115} 230}
231
232void __init parse_efi_setup(u64 phys_addr, u32 data_len)
233{
234 efi_setup = phys_addr + sizeof(struct setup_data);
235}
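efi_map_region() above hands out virtual addresses top-down while keeping each region at the same offset within a 2 MiB page as its physical address, so physical and virtual addresses stay congruent modulo 2 MiB and large pages remain usable for firmware regions that are not 2 MiB aligned. A standalone sketch of just that cursor arithmetic (PMD_SIZE and the logic mirror the hunk; the physical address and size in main() are made up):

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	(1ULL << 21)		/* 2 MiB */
#define PMD_MASK	(~(PMD_SIZE - 1))

/*
 * Move the top-down cursor past 'size' bytes and align it so it shares
 * the physical address' offset within a 2 MiB page.
 */
static uint64_t alloc_efi_va(uint64_t *efi_va, uint64_t pa, uint64_t size)
{
	*efi_va -= size;

	if (!(pa & (PMD_SIZE - 1))) {
		*efi_va &= PMD_MASK;		/* PA is 2 MiB aligned */
	} else {
		uint64_t pa_offset = pa & (PMD_SIZE - 1);
		uint64_t prev_va = *efi_va;

		*efi_va = (*efi_va & PMD_MASK) + pa_offset;
		if (*efi_va > prev_va)		/* never move the cursor up */
			*efi_va -= PMD_SIZE;
	}
	return *efi_va;
}

int main(void)
{
	uint64_t cursor = -4ULL * (1ULL << 30);		/* initial efi_va */
	uint64_t va = alloc_efi_va(&cursor, 0x7f612000, 0x4000);

	printf("va %#llx shares offset %#llx with the PA\n",
	       (unsigned long long)va,
	       (unsigned long long)(va & (PMD_SIZE - 1)));
	return 0;
}

The kernel version additionally maps every region 1:1 first and refuses to go below EFI_VA_END, as the hunk shows.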
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..88073b140298 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -34,10 +34,47 @@
34 mov %rsi, %cr0; \ 34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp 35 mov (%rsp), %rsp
36 36
37 /* stolen from gcc */
38 .macro FLUSH_TLB_ALL
39 movq %r15, efi_scratch(%rip)
40 movq %r14, efi_scratch+8(%rip)
41 movq %cr4, %r15
42 movq %r15, %r14
43 andb $0x7f, %r14b
44 movq %r14, %cr4
45 movq %r15, %cr4
46 movq efi_scratch+8(%rip), %r14
47 movq efi_scratch(%rip), %r15
48 .endm
49
50 .macro SWITCH_PGT
51 cmpb $0, efi_scratch+24(%rip)
52 je 1f
53 movq %r15, efi_scratch(%rip) # r15
54 # save previous CR3
55 movq %cr3, %r15
56 movq %r15, efi_scratch+8(%rip) # prev_cr3
57 movq efi_scratch+16(%rip), %r15 # EFI pgt
58 movq %r15, %cr3
59 1:
60 .endm
61
62 .macro RESTORE_PGT
63 cmpb $0, efi_scratch+24(%rip)
64 je 2f
65 movq efi_scratch+8(%rip), %r15
66 movq %r15, %cr3
67 movq efi_scratch(%rip), %r15
68 FLUSH_TLB_ALL
69 2:
70 .endm
71
37ENTRY(efi_call0) 72ENTRY(efi_call0)
38 SAVE_XMM 73 SAVE_XMM
39 subq $32, %rsp 74 subq $32, %rsp
75 SWITCH_PGT
40 call *%rdi 76 call *%rdi
77 RESTORE_PGT
41 addq $32, %rsp 78 addq $32, %rsp
42 RESTORE_XMM 79 RESTORE_XMM
43 ret 80 ret
@@ -47,7 +84,9 @@ ENTRY(efi_call1)
47 SAVE_XMM 84 SAVE_XMM
48 subq $32, %rsp 85 subq $32, %rsp
49 mov %rsi, %rcx 86 mov %rsi, %rcx
87 SWITCH_PGT
50 call *%rdi 88 call *%rdi
89 RESTORE_PGT
51 addq $32, %rsp 90 addq $32, %rsp
52 RESTORE_XMM 91 RESTORE_XMM
53 ret 92 ret
@@ -57,7 +96,9 @@ ENTRY(efi_call2)
57 SAVE_XMM 96 SAVE_XMM
58 subq $32, %rsp 97 subq $32, %rsp
59 mov %rsi, %rcx 98 mov %rsi, %rcx
99 SWITCH_PGT
60 call *%rdi 100 call *%rdi
101 RESTORE_PGT
61 addq $32, %rsp 102 addq $32, %rsp
62 RESTORE_XMM 103 RESTORE_XMM
63 ret 104 ret
@@ -68,7 +109,9 @@ ENTRY(efi_call3)
68 subq $32, %rsp 109 subq $32, %rsp
69 mov %rcx, %r8 110 mov %rcx, %r8
70 mov %rsi, %rcx 111 mov %rsi, %rcx
112 SWITCH_PGT
71 call *%rdi 113 call *%rdi
114 RESTORE_PGT
72 addq $32, %rsp 115 addq $32, %rsp
73 RESTORE_XMM 116 RESTORE_XMM
74 ret 117 ret
@@ -80,7 +123,9 @@ ENTRY(efi_call4)
80 mov %r8, %r9 123 mov %r8, %r9
81 mov %rcx, %r8 124 mov %rcx, %r8
82 mov %rsi, %rcx 125 mov %rsi, %rcx
126 SWITCH_PGT
83 call *%rdi 127 call *%rdi
128 RESTORE_PGT
84 addq $32, %rsp 129 addq $32, %rsp
85 RESTORE_XMM 130 RESTORE_XMM
86 ret 131 ret
@@ -93,7 +138,9 @@ ENTRY(efi_call5)
93 mov %r8, %r9 138 mov %r8, %r9
94 mov %rcx, %r8 139 mov %rcx, %r8
95 mov %rsi, %rcx 140 mov %rsi, %rcx
141 SWITCH_PGT
96 call *%rdi 142 call *%rdi
143 RESTORE_PGT
97 addq $48, %rsp 144 addq $48, %rsp
98 RESTORE_XMM 145 RESTORE_XMM
99 ret 146 ret
@@ -109,8 +156,15 @@ ENTRY(efi_call6)
109 mov %r8, %r9 156 mov %r8, %r9
110 mov %rcx, %r8 157 mov %rcx, %r8
111 mov %rsi, %rcx 158 mov %rsi, %rcx
159 SWITCH_PGT
112 call *%rdi 160 call *%rdi
161 RESTORE_PGT
113 addq $48, %rsp 162 addq $48, %rsp
114 RESTORE_XMM 163 RESTORE_XMM
115 ret 164 ret
116ENDPROC(efi_call6) 165ENDPROC(efi_call6)
166
167 .data
168ENTRY(efi_scratch)
169 .fill 3,8,0
170 .byte 0
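The fixed offsets used above (efi_scratch, +8, +16, +24) and the cmpb against +24 depend on the layout of struct efi_scratch declared in efi_64.c: r15, prev_cr3, efi_pgt, use_pgd. FLUSH_TLB_ALL toggles CR4 bit 7 (PGE) to flush global TLB entries once the previous CR3 is back. A userspace sketch of the layout dependency, with void * standing in for pgd_t *:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct efi_scratch_demo {
	unsigned long long r15;		/* scratch register save slot */
	unsigned long long prev_cr3;	/* CR3 to restore after the call */
	void *efi_pgt;			/* page table installed around the call */
	bool use_pgd;			/* tested by "cmpb $0, efi_scratch+24" */
};

int main(void)
{
	printf("r15 %zu, prev_cr3 %zu, efi_pgt %zu, use_pgd %zu\n",
	       offsetof(struct efi_scratch_demo, r15),
	       offsetof(struct efi_scratch_demo, prev_cr3),
	       offsetof(struct efi_scratch_demo, efi_pgt),
	       offsetof(struct efi_scratch_demo, use_pgd));
	return 0;	/* prints 0, 8, 16, 24 on x86-64 */
}

The same 25 bytes are what the .data blob at the end of the stub reserves with .fill 3,8,0 followed by .byte 0.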
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 01cc29ea5ff7..0a8ee703b9fa 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,6 +1,6 @@
1obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o 1obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o
2obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o 2obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
3
4# SFI specific code 4# SFI specific code
5ifdef CONFIG_X86_INTEL_MID 5ifdef CONFIG_X86_INTEL_MID
6obj-$(CONFIG_SFI) += sfi.o device_libs/ 6obj-$(CONFIG_SFI) += sfi.o device_libs/
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
index 0d942c1d26d5..69a783689d21 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -22,7 +22,9 @@ static void __init *emc1403_platform_data(void *info)
22 int intr = get_gpio_by_name("thermal_int"); 22 int intr = get_gpio_by_name("thermal_int");
23 int intr2nd = get_gpio_by_name("thermal_alert"); 23 int intr2nd = get_gpio_by_name("thermal_alert");
24 24
25 if (intr == -1 || intr2nd == -1) 25 if (intr < 0)
26 return NULL;
27 if (intr2nd < 0)
26 return NULL; 28 return NULL;
27 29
28 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 30 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
index a013a4834bbe..dccae6b0413f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -66,7 +66,7 @@ static int __init pb_keys_init(void)
66 gb[i].gpio = get_gpio_by_name(gb[i].desc); 66 gb[i].gpio = get_gpio_by_name(gb[i].desc);
67 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, 67 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
68 gb[i].gpio); 68 gb[i].gpio);
69 if (gb[i].gpio == -1) 69 if (gb[i].gpio < 0)
70 continue; 70 continue;
71 71
72 if (i != good) 72 if (i != good)
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
index 15278c11f714..54226de7541a 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -21,7 +21,9 @@ static void __init *lis331dl_platform_data(void *info)
21 int intr = get_gpio_by_name("accel_int"); 21 int intr = get_gpio_by_name("accel_int");
22 int intr2nd = get_gpio_by_name("accel_2"); 22 int intr2nd = get_gpio_by_name("accel_2");
23 23
24 if (intr == -1 || intr2nd == -1) 24 if (intr < 0)
25 return NULL;
26 if (intr2nd < 0)
25 return NULL; 27 return NULL;
26 28
27 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 29 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 94ade10024ae..2c8acbc1e9ad 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -48,7 +48,7 @@ static void __init *max7315_platform_data(void *info)
48 gpio_base = get_gpio_by_name(base_pin_name); 48 gpio_base = get_gpio_by_name(base_pin_name);
49 intr = get_gpio_by_name(intr_pin_name); 49 intr = get_gpio_by_name(intr_pin_name);
50 50
51 if (gpio_base == -1) 51 if (gpio_base < 0)
52 return NULL; 52 return NULL;
53 max7315->gpio_base = gpio_base; 53 max7315->gpio_base = gpio_base;
54 if (intr != -1) { 54 if (intr != -1) {
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
index dd28d63c84fb..cfe9a47a1e87 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -19,7 +19,7 @@ static void *mpu3050_platform_data(void *info)
19 struct i2c_board_info *i2c_info = info; 19 struct i2c_board_info *i2c_info = info;
20 int intr = get_gpio_by_name("mpu3050_int"); 20 int intr = get_gpio_by_name("mpu3050_int");
21 21
22 if (intr == -1) 22 if (intr < 0)
23 return NULL; 23 return NULL;
24 24
25 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 25 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
index d87182a09263..65c2a9a19db4 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
@@ -26,7 +26,7 @@ static void __init *pmic_gpio_platform_data(void *info)
26 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; 26 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
27 int gpio_base = get_gpio_by_name("pmic_gpio_base"); 27 int gpio_base = get_gpio_by_name("pmic_gpio_base");
28 28
29 if (gpio_base == -1) 29 if (gpio_base < 0)
30 gpio_base = 64; 30 gpio_base = 64;
31 pmic_gpio_pdata.gpio_base = gpio_base; 31 pmic_gpio_pdata.gpio_base = gpio_base;
32 pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; 32 pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
index 22881c9a6737..33be0b3be6e1 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -34,10 +34,10 @@ static void *tca6416_platform_data(void *info)
34 gpio_base = get_gpio_by_name(base_pin_name); 34 gpio_base = get_gpio_by_name(base_pin_name);
35 intr = get_gpio_by_name(intr_pin_name); 35 intr = get_gpio_by_name(intr_pin_name);
36 36
37 if (gpio_base == -1) 37 if (gpio_base < 0)
38 return NULL; 38 return NULL;
39 tca6416.gpio_base = gpio_base; 39 tca6416.gpio_base = gpio_base;
40 if (intr != -1) { 40 if (intr >= 0) {
41 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 41 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
42 tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; 42 tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
43 } else { 43 } else {
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
index 4f702f554f6e..e0bd082a80e0 100644
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/init.h>
26#include <linux/io.h> 25#include <linux/io.h>
27 26
28#include <asm/fixmap.h> 27#include <asm/fixmap.h>
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index f90e290f689f..1bbedc4b0f88 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -35,6 +35,8 @@
35#include <asm/apb_timer.h> 35#include <asm/apb_timer.h>
36#include <asm/reboot.h> 36#include <asm/reboot.h>
37 37
38#include "intel_mid_weak_decls.h"
39
38/* 40/*
39 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, 41 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
40 * cmdline option x86_intel_mid_timer can be used to override the configuration 42 * cmdline option x86_intel_mid_timer can be used to override the configuration
@@ -58,12 +60,16 @@
58 60
59enum intel_mid_timer_options intel_mid_timer_options; 61enum intel_mid_timer_options intel_mid_timer_options;
60 62
63/* intel_mid_ops to store sub arch ops */
64struct intel_mid_ops *intel_mid_ops;
65/* getter function for sub arch ops*/
66static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT;
61enum intel_mid_cpu_type __intel_mid_cpu_chip; 67enum intel_mid_cpu_type __intel_mid_cpu_chip;
62EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); 68EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
63 69
64static void intel_mid_power_off(void) 70static void intel_mid_power_off(void)
65{ 71{
66} 72};
67 73
68static void intel_mid_reboot(void) 74static void intel_mid_reboot(void)
69{ 75{
@@ -72,32 +78,6 @@ static void intel_mid_reboot(void)
72 78
73static unsigned long __init intel_mid_calibrate_tsc(void) 79static unsigned long __init intel_mid_calibrate_tsc(void)
74{ 80{
75 unsigned long fast_calibrate;
76 u32 lo, hi, ratio, fsb;
77
78 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
79 pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
80 ratio = (hi >> 8) & 0x1f;
81 pr_debug("ratio is %d\n", ratio);
82 if (!ratio) {
83 pr_err("read a zero ratio, should be incorrect!\n");
84 pr_err("force tsc ratio to 16 ...\n");
85 ratio = 16;
86 }
87 rdmsr(MSR_FSB_FREQ, lo, hi);
88 if ((lo & 0x7) == 0x7)
89 fsb = PENWELL_FSB_FREQ_83SKU;
90 else
91 fsb = PENWELL_FSB_FREQ_100SKU;
92 fast_calibrate = ratio * fsb;
93 pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
94 lapic_timer_frequency = fsb * 1000 / HZ;
95 /* mark tsc clocksource as reliable */
96 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
97
98 if (fast_calibrate)
99 return fast_calibrate;
100
101 return 0; 81 return 0;
102} 82}
103 83
@@ -125,13 +105,37 @@ static void __init intel_mid_time_init(void)
125 105
126static void intel_mid_arch_setup(void) 106static void intel_mid_arch_setup(void)
127{ 107{
128 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) 108 if (boot_cpu_data.x86 != 6) {
129 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
130 else {
131 pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", 109 pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
132 boot_cpu_data.x86, boot_cpu_data.x86_model); 110 boot_cpu_data.x86, boot_cpu_data.x86_model);
133 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; 111 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
112 goto out;
134 } 113 }
114
115 switch (boot_cpu_data.x86_model) {
116 case 0x35:
117 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
118 break;
119 case 0x3C:
120 case 0x4A:
121 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
122 break;
123 case 0x27:
124 default:
125 __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
126 break;
127 }
128
129 if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops))
130 intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
131 else {
132 intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
 133 pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");

134 }
135
136out:
137 if (intel_mid_ops->arch_setup)
138 intel_mid_ops->arch_setup();
135} 139}
136 140
137/* MID systems don't have i8042 controller */ 141/* MID systems don't have i8042 controller */
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
new file mode 100644
index 000000000000..a537ffc16299
--- /dev/null
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -0,0 +1,19 @@
1/*
2 * intel_mid_weak_decls.h: Weak declarations of intel-mid.c
3 *
4 * (C) Copyright 2013 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12
13/* __attribute__((weak)) makes these declarations overridable */
14/* For every CPU addition a new get_<cpuname>_ops interface needs
15 * to be added.
16 */
17extern void * __cpuinit get_penwell_ops(void) __attribute__((weak));
18extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak));
19extern void * __init get_tangier_ops(void) __attribute__((weak));
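The __attribute__((weak)) declarations above let intel-mid.c reference get_penwell_ops(), get_cloverview_ops() and get_tangier_ops() without hard link-time dependencies: on ELF, an undefined weak symbol resolves to a NULL address instead of causing a link error. A userspace demonstration of that property (the symbol name is invented; build it with gcc and deliberately provide no definition):

#include <stdio.h>

void *get_tangier_ops_demo(void) __attribute__((weak));	/* no definition anywhere */

int main(void)
{
	if (get_tangier_ops_demo)
		printf("ops at %p\n", get_tangier_ops_demo());
	else
		printf("no provider linked in, using a fallback\n");
	return 0;
}

In the kernel build both mfld.o and mrfl.o are linked in by the Makefile change earlier in this diff, so the strong definitions in those files are what the getter table in intel-mid.c actually ends up calling.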
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
new file mode 100644
index 000000000000..4f7884eebc14
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -0,0 +1,75 @@
1/*
2 * mfld.c: Intel Medfield platform setup code
3 *
4 * (C) Copyright 2013 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12#include <linux/init.h>
13
14#include <asm/apic.h>
15#include <asm/intel-mid.h>
16#include <asm/intel_mid_vrtc.h>
17
18#include "intel_mid_weak_decls.h"
19
20static void penwell_arch_setup(void);
21/* penwell arch ops */
22static struct intel_mid_ops penwell_ops = {
23 .arch_setup = penwell_arch_setup,
24};
25
26static void mfld_power_off(void)
27{
28}
29
30static unsigned long __init mfld_calibrate_tsc(void)
31{
32 unsigned long fast_calibrate;
33 u32 lo, hi, ratio, fsb;
34
35 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
36 pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
37 ratio = (hi >> 8) & 0x1f;
38 pr_debug("ratio is %d\n", ratio);
39 if (!ratio) {
40 pr_err("read a zero ratio, should be incorrect!\n");
41 pr_err("force tsc ratio to 16 ...\n");
42 ratio = 16;
43 }
44 rdmsr(MSR_FSB_FREQ, lo, hi);
45 if ((lo & 0x7) == 0x7)
46 fsb = FSB_FREQ_83SKU;
47 else
48 fsb = FSB_FREQ_100SKU;
49 fast_calibrate = ratio * fsb;
50 pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
51 lapic_timer_frequency = fsb * 1000 / HZ;
52 /* mark tsc clocksource as reliable */
53 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
54
55 if (fast_calibrate)
56 return fast_calibrate;
57
58 return 0;
59}
60
61static void __init penwell_arch_setup()
62{
63 x86_platform.calibrate_tsc = mfld_calibrate_tsc;
64 pm_power_off = mfld_power_off;
65}
66
67void * __cpuinit get_penwell_ops()
68{
69 return &penwell_ops;
70}
71
72void * __cpuinit get_cloverview_ops()
73{
74 return &penwell_ops;
75}
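mfld_calibrate_tsc() carries over the arithmetic that used to live in intel_mid_calibrate_tsc(): the ratio pulled out of the high word of MSR_IA32_PERF_STATUS ((hi >> 8) & 0x1f) times the FSB SKU frequency in kHz gives the TSC frequency, and fsb * 1000 / HZ is stored in lapic_timer_frequency. A tiny worked example with made-up inputs (the real FSB_FREQ_* constants live in intel-mid.h and are not shown in this diff):

#include <stdio.h>

int main(void)
{
	unsigned int ratio = 16;	/* made up; also the fallback value in the code */
	unsigned int fsb = 100000;	/* made up 100 MHz SKU, in kHz */
	unsigned int hz = 1000;		/* assumed CONFIG_HZ */

	printf("tsc = %u kHz, lapic_timer_frequency = %u\n",
	       ratio * fsb, fsb * 1000 / hz);
	return 0;	/* prints: tsc = 1600000 kHz, lapic_timer_frequency = 100000 */
}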
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
new file mode 100644
index 000000000000..09d10159e7b7
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -0,0 +1,103 @@
1/*
2 * mrfl.c: Intel Merrifield platform specific setup code
3 *
4 * (C) Copyright 2013 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12#include <linux/init.h>
13
14#include <asm/apic.h>
15#include <asm/intel-mid.h>
16
17#include "intel_mid_weak_decls.h"
18
19static unsigned long __init tangier_calibrate_tsc(void)
20{
21 unsigned long fast_calibrate;
22 u32 lo, hi, ratio, fsb, bus_freq;
23
24 /* *********************** */
25 /* Compute TSC:Ratio * FSB */
26 /* *********************** */
27
28 /* Compute Ratio */
29 rdmsr(MSR_PLATFORM_INFO, lo, hi);
30 pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo);
31
32 ratio = (lo >> 8) & 0xFF;
33 pr_debug("ratio is %d\n", ratio);
34 if (!ratio) {
35 pr_err("Read a zero ratio, force tsc ratio to 4 ...\n");
36 ratio = 4;
37 }
38
39 /* Compute FSB */
40 rdmsr(MSR_FSB_FREQ, lo, hi);
41 pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n",
42 hi, lo);
43
44 bus_freq = lo & 0x7;
45 pr_debug("bus_freq = 0x%x\n", bus_freq);
46
47 if (bus_freq == 0)
48 fsb = FSB_FREQ_100SKU;
49 else if (bus_freq == 1)
50 fsb = FSB_FREQ_100SKU;
51 else if (bus_freq == 2)
52 fsb = FSB_FREQ_133SKU;
53 else if (bus_freq == 3)
54 fsb = FSB_FREQ_167SKU;
55 else if (bus_freq == 4)
56 fsb = FSB_FREQ_83SKU;
57 else if (bus_freq == 5)
58 fsb = FSB_FREQ_400SKU;
59 else if (bus_freq == 6)
60 fsb = FSB_FREQ_267SKU;
61 else if (bus_freq == 7)
62 fsb = FSB_FREQ_333SKU;
63 else {
64 BUG();
65 pr_err("Invalid bus_freq! Setting to minimal value!\n");
66 fsb = FSB_FREQ_100SKU;
67 }
68
69 /* TSC = FSB Freq * Resolved HFM Ratio */
70 fast_calibrate = ratio * fsb;
71 pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate);
72
73 /* ************************************ */
74 /* Calculate Local APIC Timer Frequency */
75 /* ************************************ */
76 lapic_timer_frequency = (fsb * 1000) / HZ;
77
78 pr_debug("Setting lapic_timer_frequency = %d\n",
79 lapic_timer_frequency);
80
81 /* mark tsc clocksource as reliable */
82 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
83
84 if (fast_calibrate)
85 return fast_calibrate;
86
87 return 0;
88}
89
90static void __init tangier_arch_setup(void)
91{
92 x86_platform.calibrate_tsc = tangier_calibrate_tsc;
93}
94
95/* tangier arch ops */
96static struct intel_mid_ops tangier_ops = {
97 .arch_setup = tangier_arch_setup,
98};
99
100void * __cpuinit get_tangier_ops()
101{
102 return &tangier_ops;
103}
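tangier_calibrate_tsc() maps the three-bit bus_freq field to a SKU frequency with an if/else ladder; since bus_freq = lo & 0x7 can only be 0..7, every value is covered and the trailing else with BUG() is effectively unreachable. The same mapping can be written as a table, sketched here with placeholder kHz values because the FSB_FREQ_* constants are defined in intel-mid.h, outside this diff:

#include <stdio.h>

/* Index is bus_freq (lo & 0x7); the numbers are placeholders, not the
 * real FSB_FREQ_* values. */
static const unsigned int tangier_fsb_khz[8] = {
	[0] = 100000,	/* FSB_FREQ_100SKU */
	[1] = 100000,	/* FSB_FREQ_100SKU */
	[2] = 133000,	/* FSB_FREQ_133SKU */
	[3] = 167000,	/* FSB_FREQ_167SKU */
	[4] = 83000,	/* FSB_FREQ_83SKU  */
	[5] = 400000,	/* FSB_FREQ_400SKU */
	[6] = 267000,	/* FSB_FREQ_267SKU */
	[7] = 333000,	/* FSB_FREQ_333SKU */
};

static unsigned int tangier_fsb(unsigned int lo)
{
	return tangier_fsb_khz[lo & 0x7];
}

int main(void)
{
	printf("bus_freq 2 -> %u kHz\n", tangier_fsb(2));
	return 0;
}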
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index c84c1ca396bf..994c40bd7cb7 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -224,7 +224,7 @@ int get_gpio_by_name(const char *name)
224 if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) 224 if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
225 return pentry->pin_no; 225 return pentry->pin_no;
226 } 226 }
227 return -1; 227 return -EINVAL;
228} 228}
229 229
230void __init intel_scu_device_register(struct platform_device *pdev) 230void __init intel_scu_device_register(struct platform_device *pdev)
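With get_gpio_by_name() now returning -EINVAL rather than a bare -1, the callers patched earlier in this series (platform_emc1403.c, platform_lis331.c and friends) test for any negative value, which keeps working if other errno codes are ever returned. A userspace sketch of that contract, with a stub standing in for the SFI pin-table lookup:

#include <errno.h>
#include <stdio.h>

/* Stand-in for the lookup above: valid GPIO number or negative errno. */
static int get_gpio_by_name_demo(const char *name)
{
	(void)name;
	return -EINVAL;		/* pretend the pin table has no such entry */
}

int main(void)
{
	int gpio = get_gpio_by_name_demo("thermal_int");

	if (gpio < 0) {		/* catches -EINVAL and any future errno */
		fprintf(stderr, "lookup failed: %d\n", gpio);
		return 1;
	}
	printf("gpio %d\n", gpio);
	return 0;
}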
@@ -250,7 +250,7 @@ static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
250 sdev->modalias); 250 sdev->modalias);
251 return; 251 return;
252 } 252 }
253 memcpy(new_dev, sdev, sizeof(*sdev)); 253 *new_dev = *sdev;
254 254
255 spi_devs[spi_next_dev++] = new_dev; 255 spi_devs[spi_next_dev++] = new_dev;
256} 256}
@@ -271,7 +271,7 @@ static void __init intel_scu_i2c_device_register(int bus,
271 idev->type); 271 idev->type);
272 return; 272 return;
273 } 273 }
274 memcpy(new_dev, idev, sizeof(*idev)); 274 *new_dev = *idev;
275 275
276 i2c_bus[i2c_next_dev] = bus; 276 i2c_bus[i2c_next_dev] = bus;
277 i2c_devs[i2c_next_dev++] = new_dev; 277 i2c_devs[i2c_next_dev++] = new_dev;
@@ -337,6 +337,8 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
337 pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", 337 pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
338 pentry->name, pentry->irq); 338 pentry->name, pentry->irq);
339 pdata = intel_mid_sfi_get_pdata(dev, pentry); 339 pdata = intel_mid_sfi_get_pdata(dev, pentry);
340 if (IS_ERR(pdata))
341 return;
340 342
341 pdev = platform_device_alloc(pentry->name, 0); 343 pdev = platform_device_alloc(pentry->name, 0);
342 if (pdev == NULL) { 344 if (pdev == NULL) {
@@ -370,6 +372,8 @@ static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
370 spi_info.chip_select); 372 spi_info.chip_select);
371 373
372 pdata = intel_mid_sfi_get_pdata(dev, &spi_info); 374 pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
375 if (IS_ERR(pdata))
376 return;
373 377
374 spi_info.platform_data = pdata; 378 spi_info.platform_data = pdata;
375 if (dev->delay) 379 if (dev->delay)
@@ -395,6 +399,8 @@ static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
395 i2c_info.addr); 399 i2c_info.addr);
396 pdata = intel_mid_sfi_get_pdata(dev, &i2c_info); 400 pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
397 i2c_info.platform_data = pdata; 401 i2c_info.platform_data = pdata;
402 if (IS_ERR(pdata))
403 return;
398 404
399 if (dev->delay) 405 if (dev->delay)
400 intel_scu_i2c_device_register(pentry->host_num, &i2c_info); 406 intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
@@ -443,13 +449,35 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
443 * so we have to enable them one by one here 449 * so we have to enable them one by one here
444 */ 450 */
445 ioapic = mp_find_ioapic(irq); 451 ioapic = mp_find_ioapic(irq);
446 irq_attr.ioapic = ioapic; 452 if (ioapic >= 0) {
447 irq_attr.ioapic_pin = irq; 453 irq_attr.ioapic = ioapic;
448 irq_attr.trigger = 1; 454 irq_attr.ioapic_pin = irq;
449 irq_attr.polarity = 1; 455 irq_attr.trigger = 1;
450 io_apic_set_pci_routing(NULL, irq, &irq_attr); 456 if (intel_mid_identify_cpu() ==
451 } else 457 INTEL_MID_CPU_CHIP_TANGIER) {
458 if (!strncmp(pentry->name,
459 "r69001-ts-i2c", 13))
460 /* active low */
461 irq_attr.polarity = 1;
462 else if (!strncmp(pentry->name,
463 "synaptics_3202", 14))
464 /* active low */
465 irq_attr.polarity = 1;
466 else if (irq == 41)
467 /* fast_int_1 */
468 irq_attr.polarity = 1;
469 else
470 /* active high */
471 irq_attr.polarity = 0;
472 } else {
473 /* PNW and CLV go with active low */
474 irq_attr.polarity = 1;
475 }
476 io_apic_set_pci_routing(NULL, irq, &irq_attr);
477 }
478 } else {
452 irq = 0; /* No irq */ 479 irq = 0; /* No irq */
480 }
453 481
454 dev = get_device_id(pentry->type, pentry->name); 482 dev = get_device_id(pentry->type, pentry->name);
455 483
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index e6cb80f620af..4d171e8640ef 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -27,7 +27,6 @@
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/errno.h> 28#include <linux/errno.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/init.h>
31#include <linux/pm.h> 30#include <linux/pm.h>
32#include <asm/io.h> 31#include <asm/io.h>
33 32
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d7220397..dfe605ac1bcd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
433 return; 433 return;
434} 434}
435 435
436static inline unsigned long cycles_2_us(unsigned long long cyc) 436/*
437 * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
438 * number, not an absolute. It converts a duration in cycles to a duration in
439 * ns.
440 */
441static inline unsigned long long cycles_2_ns(unsigned long long cyc)
437{ 442{
443 struct cyc2ns_data *data = cyc2ns_read_begin();
438 unsigned long long ns; 444 unsigned long long ns;
439 unsigned long us;
440 int cpu = smp_processor_id();
441 445
442 ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; 446 ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
443 us = ns / 1000; 447
444 return us; 448 cyc2ns_read_end(data);
449 return ns;
450}
451
452/*
453 * The reverse of the above; converts a duration in ns to a duration in cycles.
454 */
455static inline unsigned long long ns_2_cycles(unsigned long long ns)
456{
457 struct cyc2ns_data *data = cyc2ns_read_begin();
458 unsigned long long cyc;
459
460 cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
461
462 cyc2ns_read_end(data);
463 return cyc;
464}
465
466static inline unsigned long cycles_2_us(unsigned long long cyc)
467{
468 return cycles_2_ns(cyc) / NSEC_PER_USEC;
469}
470
471static inline cycles_t sec_2_cycles(unsigned long sec)
472{
473 return ns_2_cycles(sec * NSEC_PER_SEC);
474}
475
476static inline unsigned long long usec_2_cycles(unsigned long usec)
477{
478 return ns_2_cycles(usec * NSEC_PER_USEC);
445} 479}
446 480
447/* 481/*
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
668 bcp, try); 702 bcp, try);
669} 703}
670 704
671static inline cycles_t sec_2_cycles(unsigned long sec)
672{
673 unsigned long ns;
674 cycles_t cyc;
675
676 ns = sec * 1000000000;
677 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
678 return cyc;
679}
680
681/* 705/*
682 * Our retries are blocked by all destination sw ack resources being 706 * Our retries are blocked by all destination sw ack resources being
683 * in use, and a timeout is pending. In that case hardware immediately 707 * in use, and a timeout is pending. In that case hardware immediately
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
1327{ 1351{
1328} 1352}
1329 1353
1330static inline unsigned long long usec_2_cycles(unsigned long microsec)
1331{
1332 unsigned long ns;
1333 unsigned long long cyc;
1334
1335 ns = microsec * 1000;
1336 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
1337 return cyc;
1338}
1339
1340/* 1354/*
1341 * Display the statistics thru /proc/sgi_uv/ptc_statistics 1355 * Display the statistics thru /proc/sgi_uv/ptc_statistics
1342 * 'data' points to the cpu number 1356 * 'data' points to the cpu number
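The new helpers above drop the per-cpu cyc2ns scale factor and use the cyc2ns_read_begin() data from tsc.c instead, so a cycle count converts to nanoseconds as cyc * cyc2ns_mul >> cyc2ns_shift and back as (ns << cyc2ns_shift) / cyc2ns_mul. A userspace sketch of that fixed-point round trip with made-up mul/shift values (the real ones are computed by tsc.c; the helper below only imitates the kernel's mul_u64_u32_shr()):

#include <stdint.h>
#include <stdio.h>

/* Imitation of mul_u64_u32_shr() using a 128-bit intermediate. */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * mul) >> shift);
}

int main(void)
{
	uint32_t cyc2ns_mul = 436907;		/* made up: ~0.4167 ns/cycle << 20 */
	unsigned int cyc2ns_shift = 20;
	uint64_t cyc = 2400000000ULL;		/* one second at 2.4 GHz */

	uint64_t ns = mul_u64_u32_shr(cyc, cyc2ns_mul, cyc2ns_shift);
	uint64_t back = (ns << cyc2ns_shift) / cyc2ns_mul;

	printf("%llu cycles -> %llu ns -> %llu cycles\n",
	       (unsigned long long)cyc, (unsigned long long)ns,
	       (unsigned long long)back);
	return 0;
}

cycles_2_us() and the sec/usec helpers in the hunk are then thin wrappers that divide or multiply by NSEC_PER_USEC and NSEC_PER_SEC.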
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a44f457e70a1..bad628a620c4 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -29,12 +29,10 @@ void __init reserve_real_mode(void)
29void __init setup_real_mode(void) 29void __init setup_real_mode(void)
30{ 30{
31 u16 real_mode_seg; 31 u16 real_mode_seg;
32 u32 *rel; 32 const u32 *rel;
33 u32 count; 33 u32 count;
34 u32 *ptr;
35 u16 *seg;
36 int i;
37 unsigned char *base; 34 unsigned char *base;
35 unsigned long phys_base;
38 struct trampoline_header *trampoline_header; 36 struct trampoline_header *trampoline_header;
39 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); 37 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
40#ifdef CONFIG_X86_64 38#ifdef CONFIG_X86_64
@@ -46,23 +44,23 @@ void __init setup_real_mode(void)
46 44
47 memcpy(base, real_mode_blob, size); 45 memcpy(base, real_mode_blob, size);
48 46
49 real_mode_seg = __pa(base) >> 4; 47 phys_base = __pa(base);
48 real_mode_seg = phys_base >> 4;
49
50 rel = (u32 *) real_mode_relocs; 50 rel = (u32 *) real_mode_relocs;
51 51
52 /* 16-bit segment relocations. */ 52 /* 16-bit segment relocations. */
53 count = rel[0]; 53 count = *rel++;
54 rel = &rel[1]; 54 while (count--) {
55 for (i = 0; i < count; i++) { 55 u16 *seg = (u16 *) (base + *rel++);
56 seg = (u16 *) (base + rel[i]);
57 *seg = real_mode_seg; 56 *seg = real_mode_seg;
58 } 57 }
59 58
60 /* 32-bit linear relocations. */ 59 /* 32-bit linear relocations. */
61 count = rel[i]; 60 count = *rel++;
62 rel = &rel[i + 1]; 61 while (count--) {
63 for (i = 0; i < count; i++) { 62 u32 *ptr = (u32 *) (base + *rel++);
64 ptr = (u32 *) (base + rel[i]); 63 *ptr += phys_base;
65 *ptr += __pa(base);
66 } 64 }
67 65
 68 /* Must be perfomed *after* relocation. */ 66 /* Must be perfomed *after* relocation. */
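The rewritten loops above consume real_mode_relocs as two back-to-back tables, each a 32-bit count followed by that many 32-bit offsets into the copied blob: first the 16-bit segment fixups, then the 32-bit linear fixups that get phys_base added. A userspace sketch of a walker over that layout (the blob contents in main() are invented):

#include <stdint.h>
#include <stdio.h>

/*
 * Layout, as consumed by setup_real_mode():
 *   u32 count; u32 offset[count];   - 16-bit segment relocations
 *   u32 count; u32 offset[count];   - 32-bit linear relocations
 */
static void walk_relocs(const uint32_t *rel)
{
	uint32_t count, i;

	count = *rel++;
	for (i = 0; i < count; i++)
		printf("segment fixup at offset %#x\n", *rel++);

	count = *rel++;
	for (i = 0; i < count; i++)
		printf("linear fixup at offset %#x\n", *rel++);
}

int main(void)
{
	/* invented blob: two segment fixups, one linear fixup */
	static const uint32_t blob[] = { 2, 0x10, 0x24, 1, 0x40 };

	walk_relocs(blob);
	return 0;
}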
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
index f932ea61d1c8..d66c607bdc58 100644
--- a/arch/x86/realmode/rm/reboot.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -1,5 +1,4 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h> 2#include <asm/segment.h>
4#include <asm/page_types.h> 3#include <asm/page_types.h>
5#include <asm/processor-flags.h> 4#include <asm/processor-flags.h>
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index c1b2791183e7..48ddd76bc4c3 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -20,7 +20,6 @@
20 */ 20 */
21 21
22#include <linux/linkage.h> 22#include <linux/linkage.h>
23#include <linux/init.h>
24#include <asm/segment.h> 23#include <asm/segment.h>
25#include <asm/page_types.h> 24#include <asm/page_types.h>
26#include "realmode.h" 25#include "realmode.h"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index bb360dc39d21..dac7b20d2f9d 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -25,7 +25,6 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/init.h>
29#include <asm/pgtable_types.h> 28#include <asm/pgtable_types.h>
30#include <asm/page_types.h> 29#include <asm/page_types.h>
31#include <asm/msr.h> 30#include <asm/msr.h>
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
358349 i386 kcmp sys_kcmp 358349 i386 kcmp sys_kcmp
359350 i386 finit_module sys_finit_module 359350 i386 finit_module sys_finit_module
360351 i386 sched_setattr sys_sched_setattr
361352 i386 sched_getattr sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..a12bddc7ccea 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
320311 64 process_vm_writev sys_process_vm_writev 320311 64 process_vm_writev sys_process_vm_writev
321312 common kcmp sys_kcmp 321312 common kcmp sys_kcmp
322313 common finit_module sys_finit_module 322313 common finit_module sys_finit_module
323314 common sched_setattr sys_sched_setattr
324315 common sched_getattr sys_sched_getattr
323 325
324# 326#
325# x32-specific system call numbers start at 512 to avoid cache impact 327# x32-specific system call numbers start at 512 to avoid cache impact
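The two new rows hook up the sched_setattr()/sched_getattr() system calls (the SCHED_DEADLINE interface added in this cycle); 314 and 315 are the x86-64 numbers from the table above, 351 and 352 the i386 ones. A minimal usage sketch: the struct layout below is written out by hand from the uapi definition of the time and should be treated as an assumption, not a copy of the header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Assumed layout of struct sched_attr (include/uapi/linux/sched.h). */
struct sched_attr_demo {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_demo attr;

	memset(&attr, 0, sizeof(attr));
	/* 315 = sched_getattr on x86-64, straight from the table above. */
	if (syscall(315, 0, &attr, (unsigned int)sizeof(attr), 0) == 0)
		printf("policy %u, nice %d\n", attr.sched_policy, attr.sched_nice);
	else
		perror("sched_getattr");
	return 0;
}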
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index f7bab68a4b83..11f9285a2ff6 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -722,15 +722,25 @@ static void percpu_init(void)
722 722
723/* 723/*
724 * Check to see if a symbol lies in the .data..percpu section. 724 * Check to see if a symbol lies in the .data..percpu section.
725 * For some as yet not understood reason the "__init_begin" 725 *
726 * symbol which immediately preceeds the .data..percpu section 726 * The linker incorrectly associates some symbols with the
727 * also shows up as it it were part of it so we do an explict 727 * .data..percpu section so we also need to check the symbol
728 * check for that symbol name and ignore it. 728 * name to make sure that we classify the symbol correctly.
729 *
730 * The GNU linker incorrectly associates:
731 * __init_begin
732 * __per_cpu_load
733 *
734 * The "gold" linker incorrectly associates:
735 * init_per_cpu__irq_stack_union
736 * init_per_cpu__gdt_page
729 */ 737 */
730static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) 738static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
731{ 739{
732 return (sym->st_shndx == per_cpu_shndx) && 740 return (sym->st_shndx == per_cpu_shndx) &&
733 strcmp(symname, "__init_begin"); 741 strcmp(symname, "__init_begin") &&
742 strcmp(symname, "__per_cpu_load") &&
743 strncmp(symname, "init_per_cpu_", 13);
734} 744}
735 745
736 746
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 2ada505067cc..eb5d7a56f8d4 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
178 178
179 ts->tv_nsec = 0; 179 ts->tv_nsec = 0;
180 do { 180 do {
181 seq = read_seqcount_begin_no_lockdep(&gtod->seq); 181 seq = raw_read_seqcount_begin(&gtod->seq);
182 mode = gtod->clock.vclock_mode; 182 mode = gtod->clock.vclock_mode;
183 ts->tv_sec = gtod->wall_time_sec; 183 ts->tv_sec = gtod->wall_time_sec;
184 ns = gtod->wall_time_snsec; 184 ns = gtod->wall_time_snsec;
@@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts)
198 198
199 ts->tv_nsec = 0; 199 ts->tv_nsec = 0;
200 do { 200 do {
201 seq = read_seqcount_begin_no_lockdep(&gtod->seq); 201 seq = raw_read_seqcount_begin(&gtod->seq);
202 mode = gtod->clock.vclock_mode; 202 mode = gtod->clock.vclock_mode;
203 ts->tv_sec = gtod->monotonic_time_sec; 203 ts->tv_sec = gtod->monotonic_time_sec;
204 ns = gtod->monotonic_time_snsec; 204 ns = gtod->monotonic_time_snsec;
@@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts)
214{ 214{
215 unsigned long seq; 215 unsigned long seq;
216 do { 216 do {
217 seq = read_seqcount_begin_no_lockdep(&gtod->seq); 217 seq = raw_read_seqcount_begin(&gtod->seq);
218 ts->tv_sec = gtod->wall_time_coarse.tv_sec; 218 ts->tv_sec = gtod->wall_time_coarse.tv_sec;
219 ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; 219 ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
220 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 220 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
225{ 225{
226 unsigned long seq; 226 unsigned long seq;
227 do { 227 do {
228 seq = read_seqcount_begin_no_lockdep(&gtod->seq); 228 seq = raw_read_seqcount_begin(&gtod->seq);
229 ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; 229 ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
230 ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; 230 ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
231 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 231 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
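All four readers above use the same lockless snapshot pattern, now spelled raw_read_seqcount_begin(): sample the sequence counter, copy the data, and retry if the counter was odd or changed in the meantime. A simplified userspace sketch of that reader loop using C11 atomics (the kernel's seqcount API also handles lockdep and barriers for you; the plain struct copy here is a deliberate simplification):

#include <stdatomic.h>
#include <stdio.h>

struct snapshot { long sec; long nsec; };

static _Atomic unsigned int seq;	/* even: stable, odd: writer active */
static struct snapshot shared;

static struct snapshot read_snapshot(void)
{
	struct snapshot s;
	unsigned int start;

	do {
		start = atomic_load_explicit(&seq, memory_order_acquire);
		s = shared;			/* may observe a torn update */
		atomic_thread_fence(memory_order_acquire);
	} while ((start & 1) ||
		 start != atomic_load_explicit(&seq, memory_order_relaxed));

	return s;
}

int main(void)
{
	struct snapshot s = read_snapshot();

	printf("%ld.%09ld\n", s.sec, s.nsec);
	return 0;
}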
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 01f5e3b4613c..1e13eb8c9656 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,6 +1,5 @@
1#include <asm/page_types.h> 1#include <asm/page_types.h>
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <linux/init.h>
4 3
5__PAGE_ALIGNED_DATA 4__PAGE_ALIGNED_DATA
6 5
diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S
index d6b9a7f42a8a..295f1c7543d8 100644
--- a/arch/x86/vdso/vdsox32.S
+++ b/arch/x86/vdso/vdsox32.S
@@ -1,6 +1,5 @@
1#include <asm/page_types.h> 1#include <asm/page_types.h>
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <linux/init.h>
4 3
5__PAGE_ALIGNED_DATA 4__PAGE_ALIGNED_DATA
6 5
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1a3c76505649..01b90261fa38 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -51,3 +51,7 @@ config XEN_DEBUG_FS
51 Enable statistics output and various tuning options in debugfs. 51 Enable statistics output and various tuning options in debugfs.
52 Enabling this option may incur a significant performance overhead. 52 Enabling this option may incur a significant performance overhead.
53 53
54config XEN_PVH
55 bool "Support for running as a PVH guest"
56 depends on X86_64 && XEN && XEN_PVHVM
57 def_bool n
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fa6ade76ef3f..a4d7b647867f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -262,8 +262,9 @@ static void __init xen_banner(void)
262 struct xen_extraversion extra; 262 struct xen_extraversion extra;
263 HYPERVISOR_xen_version(XENVER_extraversion, &extra); 263 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
264 264
265 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 265 pr_info("Booting paravirtualized kernel %son %s\n",
266 pv_info.name); 266 xen_feature(XENFEAT_auto_translated_physmap) ?
267 "with PVH extensions " : "", pv_info.name);
267 printk(KERN_INFO "Xen version: %d.%d%s%s\n", 268 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
268 version >> 16, version & 0xffff, extra.extraversion, 269 version >> 16, version & 0xffff, extra.extraversion,
269 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 270 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -433,7 +434,7 @@ static void __init xen_init_cpuid_mask(void)
433 434
434 ax = 1; 435 ax = 1;
435 cx = 0; 436 cx = 0;
436 xen_cpuid(&ax, &bx, &cx, &dx); 437 cpuid(1, &ax, &bx, &cx, &dx);
437 438
438 xsave_mask = 439 xsave_mask =
439 (1 << (X86_FEATURE_XSAVE % 32)) | 440 (1 << (X86_FEATURE_XSAVE % 32)) |
@@ -1142,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)
1142 xen_vcpu_setup(cpu); 1143 xen_vcpu_setup(cpu);
1143 1144
1144 /* xen_vcpu_setup managed to place the vcpu_info within the 1145 /* xen_vcpu_setup managed to place the vcpu_info within the
1145 percpu area for all cpus, so make use of it */ 1146 * percpu area for all cpus, so make use of it. Note that for
1146 if (have_vcpu_info_placement) { 1147 * PVH we want to use native IRQ mechanism. */
1148 if (have_vcpu_info_placement && !xen_pvh_domain()) {
1147 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 1149 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1148 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 1150 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1149 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 1151 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1407,9 +1409,49 @@ static void __init xen_boot_params_init_edd(void)
1407 * Set up the GDT and segment registers for -fstack-protector. Until 1409 * Set up the GDT and segment registers for -fstack-protector. Until
1408 * we do this, we have to be careful not to call any stack-protected 1410 * we do this, we have to be careful not to call any stack-protected
1409 * function, which is most of the kernel. 1411 * function, which is most of the kernel.
1412 *
1413 * Note, that it is __ref because the only caller of this after init
1414 * is PVH which is not going to use xen_load_gdt_boot or other
1415 * __init functions.
1410 */ 1416 */
1411static void __init xen_setup_stackprotector(void) 1417static void __ref xen_setup_gdt(int cpu)
1412{ 1418{
1419 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1420#ifdef CONFIG_X86_64
1421 unsigned long dummy;
1422
1423 load_percpu_segment(cpu); /* We need to access per-cpu area */
1424 switch_to_new_gdt(cpu); /* GDT and GS set */
1425
1426 /* We are switching of the Xen provided GDT to our HVM mode
1427 * GDT. The new GDT has __KERNEL_CS with CS.L = 1
1428 * and we are jumping to reload it.
1429 */
1430 asm volatile ("pushq %0\n"
1431 "leaq 1f(%%rip),%0\n"
1432 "pushq %0\n"
1433 "lretq\n"
1434 "1:\n"
1435 : "=&r" (dummy) : "0" (__KERNEL_CS));
1436
1437 /*
1438 * While not needed, we also set the %es, %ds, and %fs
1439 * to zero. We don't care about %ss as it is NULL.
1440 * Strictly speaking this is not needed as Xen zeros those
1441 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
1442 *
1443 * Linux zeros them in cpu_init() and in secondary_startup_64
1444 * (for BSP).
1445 */
1446 loadsegment(es, 0);
1447 loadsegment(ds, 0);
1448 loadsegment(fs, 0);
1449#else
1450 /* PVH: TODO Implement. */
1451 BUG();
1452#endif
1453 return; /* PVH does not need any PV GDT ops. */
1454 }
1413 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; 1455 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
1414 pv_cpu_ops.load_gdt = xen_load_gdt_boot; 1456 pv_cpu_ops.load_gdt = xen_load_gdt_boot;
1415 1457
@@ -1420,6 +1462,46 @@ static void __init xen_setup_stackprotector(void)
1420 pv_cpu_ops.load_gdt = xen_load_gdt; 1462 pv_cpu_ops.load_gdt = xen_load_gdt;
1421} 1463}
1422 1464
1465/*
1466 * A PV guest starts with default flags that are not set for PVH, set them
1467 * here asap.
1468 */
1469static void xen_pvh_set_cr_flags(int cpu)
1470{
1471
1472 /* Some of these are setup in 'secondary_startup_64'. The others:
1473 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
1474 * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
1475 write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
1476}
1477
1478/*
1479 * Note, that it is ref - because the only caller of this after init
1480 * is PVH which is not going to use xen_load_gdt_boot or other
1481 * __init functions.
1482 */
1483void __ref xen_pvh_secondary_vcpu_init(int cpu)
1484{
1485 xen_setup_gdt(cpu);
1486 xen_pvh_set_cr_flags(cpu);
1487}
1488
1489static void __init xen_pvh_early_guest_init(void)
1490{
1491 if (!xen_feature(XENFEAT_auto_translated_physmap))
1492 return;
1493
1494 if (!xen_feature(XENFEAT_hvm_callback_vector))
1495 return;
1496
1497 xen_have_vector_callback = 1;
1498 xen_pvh_set_cr_flags(0);
1499
1500#ifdef CONFIG_X86_32
1501 BUG(); /* PVH: Implement proper support. */
1502#endif
1503}
1504
1423/* First C function to be called on Xen boot */ 1505/* First C function to be called on Xen boot */
1424asmlinkage void __init xen_start_kernel(void) 1506asmlinkage void __init xen_start_kernel(void)
1425{ 1507{
@@ -1431,13 +1513,16 @@ asmlinkage void __init xen_start_kernel(void)
1431 1513
1432 xen_domain_type = XEN_PV_DOMAIN; 1514 xen_domain_type = XEN_PV_DOMAIN;
1433 1515
1516 xen_setup_features();
1517 xen_pvh_early_guest_init();
1434 xen_setup_machphys_mapping(); 1518 xen_setup_machphys_mapping();
1435 1519
1436 /* Install Xen paravirt ops */ 1520 /* Install Xen paravirt ops */
1437 pv_info = xen_info; 1521 pv_info = xen_info;
1438 pv_init_ops = xen_init_ops; 1522 pv_init_ops = xen_init_ops;
1439 pv_cpu_ops = xen_cpu_ops;
1440 pv_apic_ops = xen_apic_ops; 1523 pv_apic_ops = xen_apic_ops;
1524 if (!xen_pvh_domain())
1525 pv_cpu_ops = xen_cpu_ops;
1441 1526
1442 x86_init.resources.memory_setup = xen_memory_setup; 1527 x86_init.resources.memory_setup = xen_memory_setup;
1443 x86_init.oem.arch_setup = xen_arch_setup; 1528 x86_init.oem.arch_setup = xen_arch_setup;
@@ -1469,17 +1554,14 @@ asmlinkage void __init xen_start_kernel(void)
1469 /* Work out if we support NX */ 1554 /* Work out if we support NX */
1470 x86_configure_nx(); 1555 x86_configure_nx();
1471 1556
1472 xen_setup_features();
1473
1474 /* Get mfn list */ 1557 /* Get mfn list */
1475 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1558 xen_build_dynamic_phys_to_machine();
1476 xen_build_dynamic_phys_to_machine();
1477 1559
1478 /* 1560 /*
1479 * Set up kernel GDT and segment registers, mainly so that 1561 * Set up kernel GDT and segment registers, mainly so that
1480 * -fstack-protector code can be executed. 1562 * -fstack-protector code can be executed.
1481 */ 1563 */
1482 xen_setup_stackprotector(); 1564 xen_setup_gdt(0);
1483 1565
1484 xen_init_irq_ops(); 1566 xen_init_irq_ops();
1485 xen_init_cpuid_mask(); 1567 xen_init_cpuid_mask();
@@ -1548,14 +1630,18 @@ asmlinkage void __init xen_start_kernel(void)
1548 /* set the limit of our address space */ 1630 /* set the limit of our address space */
1549 xen_reserve_top(); 1631 xen_reserve_top();
1550 1632
1551 /* We used to do this in xen_arch_setup, but that is too late on AMD 1633 /* PVH: runs at default kernel iopl of 0 */
1552 * were early_cpu_init (run before ->arch_setup()) calls early_amd_init 1634 if (!xen_pvh_domain()) {
1553 * which pokes 0xcf8 port. 1635 /*
1554 */ 1636 * We used to do this in xen_arch_setup, but that is too late
1555 set_iopl.iopl = 1; 1637 * on AMD were early_cpu_init (run before ->arch_setup()) calls
1556 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1638 * early_amd_init which pokes 0xcf8 port.
1557 if (rc != 0) 1639 */
1558 xen_raw_printk("physdev_op failed %d\n", rc); 1640 set_iopl.iopl = 1;
1641 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1642 if (rc != 0)
1643 xen_raw_printk("physdev_op failed %d\n", rc);
1644 }
1559 1645
1560#ifdef CONFIG_X86_32 1646#ifdef CONFIG_X86_32
1561 /* set up basic CPUID stuff */ 1647 /* set up basic CPUID stuff */
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 3a5f55d51907..103c93f874b2 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -125,3 +125,66 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
125 apply_to_page_range(&init_mm, (unsigned long)shared, 125 apply_to_page_range(&init_mm, (unsigned long)shared,
126 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); 126 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
127} 127}
128#ifdef CONFIG_XEN_PVH
129#include <xen/balloon.h>
130#include <xen/events.h>
131#include <xen/xen.h>
132#include <linux/slab.h>
133static int __init xlated_setup_gnttab_pages(void)
134{
135 struct page **pages;
136 xen_pfn_t *pfns;
137 int rc;
138 unsigned int i;
139 unsigned long nr_grant_frames = gnttab_max_grant_frames();
140
141 BUG_ON(nr_grant_frames == 0);
142 pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
143 if (!pages)
144 return -ENOMEM;
145
146 pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
147 if (!pfns) {
148 kfree(pages);
149 return -ENOMEM;
150 }
151 rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
152 if (rc) {
153 pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
154 nr_grant_frames, rc);
155 kfree(pages);
156 kfree(pfns);
157 return rc;
158 }
159 for (i = 0; i < nr_grant_frames; i++)
160 pfns[i] = page_to_pfn(pages[i]);
161
162 rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
163 &xen_auto_xlat_grant_frames.vaddr);
164
165 kfree(pages);
166 if (rc) {
167 pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
168 nr_grant_frames, rc);
169 free_xenballooned_pages(nr_grant_frames, pages);
170 kfree(pfns);
171 return rc;
172 }
173
174 xen_auto_xlat_grant_frames.pfn = pfns;
175 xen_auto_xlat_grant_frames.count = nr_grant_frames;
176
177 return 0;
178}
179
180static int __init xen_pvh_gnttab_setup(void)
181{
182 if (!xen_pvh_domain())
183 return -ENODEV;
184
185 return xlated_setup_gnttab_pages();
186}
187/* Call it _before_ __gnttab_init as we need to initialize the
188 * xen_auto_xlat_grant_frames first. */
189core_initcall(xen_pvh_gnttab_setup);
190#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 0da7f863056f..76ca326105f7 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -5,6 +5,7 @@
5#include <xen/interface/xen.h> 5#include <xen/interface/xen.h>
6#include <xen/interface/sched.h> 6#include <xen/interface/sched.h>
7#include <xen/interface/vcpu.h> 7#include <xen/interface/vcpu.h>
8#include <xen/features.h>
8#include <xen/events.h> 9#include <xen/events.h>
9 10
10#include <asm/xen/hypercall.h> 11#include <asm/xen/hypercall.h>
@@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
128 129
129void __init xen_init_irq_ops(void) 130void __init xen_init_irq_ops(void)
130{ 131{
131 pv_irq_ops = xen_irq_ops; 132 /* For PVH we use default pv_irq_ops settings. */
133 if (!xen_feature(XENFEAT_hvm_callback_vector))
134 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 135 x86_init.irqs.intr_init = xen_init_IRQ;
133} 136}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ce563be09cc1..c1d406f35523 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
1198 * instead of somewhere later and be confusing. */ 1198 * instead of somewhere later and be confusing. */
1199 xen_mc_flush(); 1199 xen_mc_flush();
1200} 1200}
1201#endif 1201static void __init xen_pagetable_p2m_copy(void)
1202static void __init xen_pagetable_init(void)
1203{ 1202{
1204#ifdef CONFIG_X86_64
1205 unsigned long size; 1203 unsigned long size;
1206 unsigned long addr; 1204 unsigned long addr;
1207#endif 1205 unsigned long new_mfn_list;
1208 paging_init(); 1206
1209 xen_setup_shared_info(); 1207 if (xen_feature(XENFEAT_auto_translated_physmap))
1210#ifdef CONFIG_X86_64 1208 return;
1211 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1209
1212 unsigned long new_mfn_list; 1210 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1213 1211
1214 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1212 new_mfn_list = xen_revector_p2m_tree();
1215 1213 /* No memory or already called. */
1216 /* On 32-bit, we get zero so this never gets executed. */ 1214 if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
1217 new_mfn_list = xen_revector_p2m_tree(); 1215 return;
1218 if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { 1216
1219 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1217 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1220 memset((void *)xen_start_info->mfn_list, 0xff, size); 1218 memset((void *)xen_start_info->mfn_list, 0xff, size);
1221 1219
1222 /* We should be in __ka space. */ 1220 /* We should be in __ka space. */
1223 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); 1221 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1224 addr = xen_start_info->mfn_list; 1222 addr = xen_start_info->mfn_list;
1225 /* We roundup to the PMD, which means that if anybody at this stage is 1223 /* We roundup to the PMD, which means that if anybody at this stage is
1226 * using the __ka address of xen_start_info or xen_start_info->shared_info 1224 * using the __ka address of xen_start_info or xen_start_info->shared_info
1227 * they are in going to crash. Fortunatly we have already revectored 1225 * they are in going to crash. Fortunatly we have already revectored
1228 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ 1226 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1229 size = roundup(size, PMD_SIZE); 1227 size = roundup(size, PMD_SIZE);
1230 xen_cleanhighmap(addr, addr + size); 1228 xen_cleanhighmap(addr, addr + size);
1231 1229
1232 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1230 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1233 memblock_free(__pa(xen_start_info->mfn_list), size); 1231 memblock_free(__pa(xen_start_info->mfn_list), size);
1234 /* And revector! Bye bye old array */ 1232 /* And revector! Bye bye old array */
1235 xen_start_info->mfn_list = new_mfn_list; 1233 xen_start_info->mfn_list = new_mfn_list;
1236 } else 1234
1237 goto skip;
1238 }
1239 /* At this stage, cleanup_highmap has already cleaned __ka space 1235 /* At this stage, cleanup_highmap has already cleaned __ka space
1240 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1236 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1241 * the ramdisk). We continue on, erasing PMD entries that point to page 1237 * the ramdisk). We continue on, erasing PMD entries that point to page
@@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void)
1255 * anything at this stage. */ 1251 * anything at this stage. */
1256 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1252 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1257#endif 1253#endif
1258skip: 1254}
1255#endif
1256
1257static void __init xen_pagetable_init(void)
1258{
1259 paging_init();
1260 xen_setup_shared_info();
1261#ifdef CONFIG_X86_64
1262 xen_pagetable_p2m_copy();
1259#endif 1263#endif
1260 xen_post_allocator_init(); 1264 xen_post_allocator_init();
1261} 1265}
@@ -1753,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
1753 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1757 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1754 pte_t pte = pfn_pte(pfn, prot); 1758 pte_t pte = pfn_pte(pfn, prot);
1755 1759
1760 /* For PVH no need to set R/O or R/W to pin them or unpin them. */
1761 if (xen_feature(XENFEAT_auto_translated_physmap))
1762 return;
1763
1756 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 1764 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1757 BUG(); 1765 BUG();
1758} 1766}
@@ -1863,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1863 * but that's enough to get __va working. We need to fill in the rest 1871 * but that's enough to get __va working. We need to fill in the rest
1864 * of the physical mapping once some sort of allocator has been set 1872 * of the physical mapping once some sort of allocator has been set
1865 * up. 1873 * up.
1874 * NOTE: for PVH, the page tables are native.
1866 */ 1875 */
1867void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1876void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1868{ 1877{
@@ -1884,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1884 /* Zap identity mapping */ 1893 /* Zap identity mapping */
1885 init_level4_pgt[0] = __pgd(0); 1894 init_level4_pgt[0] = __pgd(0);
1886 1895
1887 /* Pre-constructed entries are in pfn, so convert to mfn */ 1896 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1888 /* L4[272] -> level3_ident_pgt 1897 /* Pre-constructed entries are in pfn, so convert to mfn */
1889 * L4[511] -> level3_kernel_pgt */ 1898 /* L4[272] -> level3_ident_pgt
1890 convert_pfn_mfn(init_level4_pgt); 1899 * L4[511] -> level3_kernel_pgt */
1891 1900 convert_pfn_mfn(init_level4_pgt);
1892 /* L3_i[0] -> level2_ident_pgt */ 1901
1893 convert_pfn_mfn(level3_ident_pgt); 1902 /* L3_i[0] -> level2_ident_pgt */
1894 /* L3_k[510] -> level2_kernel_pgt 1903 convert_pfn_mfn(level3_ident_pgt);
1895 * L3_i[511] -> level2_fixmap_pgt */ 1904 /* L3_k[510] -> level2_kernel_pgt
1896 convert_pfn_mfn(level3_kernel_pgt); 1905 * L3_i[511] -> level2_fixmap_pgt */
1897 1906 convert_pfn_mfn(level3_kernel_pgt);
1907 }
1898 /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 1908 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1899 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1909 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1900 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1910 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1918,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1918 copy_page(level2_fixmap_pgt, l2); 1928 copy_page(level2_fixmap_pgt, l2);
1919 /* Note that we don't do anything with level1_fixmap_pgt which 1929 /* Note that we don't do anything with level1_fixmap_pgt which
1920 * we don't need. */ 1930 * we don't need. */
1931 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1932 /* Make pagetable pieces RO */
1933 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1934 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1935 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1936 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1937 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1938 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1939 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1940
1941 /* Pin down new L4 */
1942 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1943 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1944
1945 /* Unpin Xen-provided one */
1946 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1921 1947
1922 /* Make pagetable pieces RO */ 1948 /*
1923 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1949 * At this stage there can be no user pgd, and no page
1924 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1950 * structure to attach it to, so make sure we just set kernel
1925 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1951 * pgd.
1926 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1952 */
1927 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 1953 xen_mc_batch();
1928 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1954 __xen_write_cr3(true, __pa(init_level4_pgt));
1929 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1955 xen_mc_issue(PARAVIRT_LAZY_CPU);
1930 1956 } else
1931 /* Pin down new L4 */ 1957 native_write_cr3(__pa(init_level4_pgt));
1932 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1933 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1934
1935 /* Unpin Xen-provided one */
1936 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1937
1938 /*
1939 * At this stage there can be no user pgd, and no page
1940 * structure to attach it to, so make sure we just set kernel
1941 * pgd.
1942 */
1943 xen_mc_batch();
1944 __xen_write_cr3(true, __pa(init_level4_pgt));
1945 xen_mc_issue(PARAVIRT_LAZY_CPU);
1946 1958
1947 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1959 /* We can't that easily rip out L3 and L2, as the Xen pagetables are
1948 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1960 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2103,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2103 2115
2104static void __init xen_post_allocator_init(void) 2116static void __init xen_post_allocator_init(void)
2105{ 2117{
2118 if (xen_feature(XENFEAT_auto_translated_physmap))
2119 return;
2120
2106 pv_mmu_ops.set_pte = xen_set_pte; 2121 pv_mmu_ops.set_pte = xen_set_pte;
2107 pv_mmu_ops.set_pmd = xen_set_pmd; 2122 pv_mmu_ops.set_pmd = xen_set_pmd;
2108 pv_mmu_ops.set_pud = xen_set_pud; 2123 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2207,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2207void __init xen_init_mmu_ops(void) 2222void __init xen_init_mmu_ops(void)
2208{ 2223{
2209 x86_init.paging.pagetable_init = xen_pagetable_init; 2224 x86_init.paging.pagetable_init = xen_pagetable_init;
2225
2226 /* Optimization - we can use the HVM one but it has no idea which
2227 * VCPUs are descheduled - which means that it will needlessly IPI
2228 * them. Xen knows so let it do the job.
2229 */
2230 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2231 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2232 return;
2233 }
2210 pv_mmu_ops = xen_mmu_ops; 2234 pv_mmu_ops = xen_mmu_ops;
2211 2235
2212 memset(dummy_mapping, 0xff, PAGE_SIZE); 2236 memset(dummy_mapping, 0xff, PAGE_SIZE);
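
The new xen_pagetable_p2m_copy() sizes the MFN list as nr_pages * sizeof(unsigned long), page-aligns it for the memset()/memblock_free() calls, and rounds the same size up to PMD_SIZE before unmapping the old __ka range. The standalone sketch below walks through that arithmetic, assuming the usual x86-64 constants (4 KiB pages, 2 MiB PMDs).

/* Worked example of the size arithmetic in xen_pagetable_p2m_copy(),
 * with x86-64 constants assumed: 4 KiB pages, 2 MiB PMD coverage. */
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PMD_SIZE        (2UL * 1024 * 1024)
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long nr_pages = (1UL << 20) + 123;   /* roughly a 4 GiB guest */
        unsigned long size = nr_pages * sizeof(unsigned long);

        size = ALIGN_UP(size, PAGE_SIZE);             /* PAGE_ALIGN() */
        printf("mfn_list bytes (page aligned): %lu\n", size);

        /* The old list is torn down in whole PMDs, so round up again. */
        printf("bytes cleaned in __ka space:   %lu\n",
               ALIGN_UP(size, PMD_SIZE));
        return 0;
}
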
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 2ae8699e8767..696c694986d0 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void)
280{ 280{
281 unsigned long pfn; 281 unsigned long pfn;
282 282
283 if (xen_feature(XENFEAT_auto_translated_physmap))
284 return;
285
283 /* Pre-initialize p2m_top_mfn to be completely missing */ 286 /* Pre-initialize p2m_top_mfn to be completely missing */
284 if (p2m_top_mfn == NULL) { 287 if (p2m_top_mfn == NULL) {
285 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); 288 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
@@ -336,6 +339,9 @@ void __ref xen_build_mfn_list_list(void)
336 339
337void xen_setup_mfn_list_list(void) 340void xen_setup_mfn_list_list(void)
338{ 341{
342 if (xen_feature(XENFEAT_auto_translated_physmap))
343 return;
344
339 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 345 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
340 346
341 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 347 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -346,10 +352,15 @@ void xen_setup_mfn_list_list(void)
346/* Set up p2m_top to point to the domain-builder provided p2m pages */ 352/* Set up p2m_top to point to the domain-builder provided p2m pages */
347void __init xen_build_dynamic_phys_to_machine(void) 353void __init xen_build_dynamic_phys_to_machine(void)
348{ 354{
349 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; 355 unsigned long *mfn_list;
350 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 356 unsigned long max_pfn;
351 unsigned long pfn; 357 unsigned long pfn;
352 358
359 if (xen_feature(XENFEAT_auto_translated_physmap))
360 return;
361
362 mfn_list = (unsigned long *)xen_start_info->mfn_list;
363 max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
353 xen_max_p2m_pfn = max_pfn; 364 xen_max_p2m_pfn = max_pfn;
354 365
355 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 366 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
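
The early returns added above reflect that an auto-translated guest never owns a p2m table: from its point of view pfn and mfn coincide, so there is nothing to build or publish to the hypervisor. The toy model below contrasts the two cases; the table contents and helper are simplifications for illustration only.

/* Toy model: pfn_to_mfn() degenerates to the identity for an
 * auto-translated guest, while classic PV consults the table handed
 * over by the domain builder.  Values here are made up. */
#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 8

static const unsigned long p2m[NR_PAGES] = { 42, 7, 19, 3, 88, 5, 61, 12 };

static unsigned long pfn_to_mfn(unsigned long pfn, bool auto_translated)
{
        if (auto_translated)
                return pfn;             /* hardware (EPT/NPT) translates */
        return p2m[pfn];                /* PV: look it up in the p2m     */
}

int main(void)
{
        printf("PV  : pfn 3 -> mfn %lu\n", pfn_to_mfn(3, false));
        printf("PVH : pfn 3 -> mfn %lu\n", pfn_to_mfn(3, true));
        return 0;
}
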
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ffe..a8261716d58d 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -30,10 +30,9 @@
30#define XEN_PLATFORM_ERR_PROTOCOL -2 30#define XEN_PLATFORM_ERR_PROTOCOL -2
31#define XEN_PLATFORM_ERR_BLACKLIST -3 31#define XEN_PLATFORM_ERR_BLACKLIST -3
32 32
33/* store the value of xen_emul_unplug after the unplug is done */
34int xen_platform_pci_unplug;
35EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
36#ifdef CONFIG_XEN_PVHVM 33#ifdef CONFIG_XEN_PVHVM
34/* store the value of xen_emul_unplug after the unplug is done */
35static int xen_platform_pci_unplug;
37static int xen_emul_unplug; 36static int xen_emul_unplug;
38 37
39static int check_platform_magic(void) 38static int check_platform_magic(void)
@@ -69,6 +68,80 @@ static int check_platform_magic(void)
69 return 0; 68 return 0;
70} 69}
71 70
71bool xen_has_pv_devices()
72{
73 if (!xen_domain())
74 return false;
75
76 /* PV domains always have them. */
77 if (xen_pv_domain())
78 return true;
79
80 /* The user has xen_platform_pci=0 set in the guest config and
81 * the driver did not modify the value. */
82 if (xen_platform_pci_unplug == 0)
83 return false;
84
85 if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
86 return false;
87
88 if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
89 return true;
90
91 /* This is an odd one - we are going to run legacy
92 * and PV drivers at the same time. */
93 if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
94 return true;
95
96 /* The caller has to follow up with xen_has_pv_{disk,nic}_devices
97 * to be certain which driver can load. */
98 return false;
99}
100EXPORT_SYMBOL_GPL(xen_has_pv_devices);
101
102static bool __xen_has_pv_device(int state)
103{
104 /* HVM domains might or might not have PV devices. */
105 if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
106 return true;
107
108 return xen_has_pv_devices();
109}
110
111bool xen_has_pv_nic_devices(void)
112{
113 return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
114}
115EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
116
117bool xen_has_pv_disk_devices(void)
118{
119 return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
120 XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
121}
122EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
123
124/*
125 * This one is odd - it determines whether you want to run PV _and_
126 * legacy (IDE) drivers together. This combination is only possible
127 * under HVM.
128 */
129bool xen_has_pv_and_legacy_disk_devices(void)
130{
131 if (!xen_domain())
132 return false;
133
134 /* N.B. This is only ever used in HVM mode */
135 if (xen_pv_domain())
136 return false;
137
138 if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
139 return true;
140
141 return false;
142}
143EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
144
72void xen_unplug_emulated_devices(void) 145void xen_unplug_emulated_devices(void)
73{ 146{
74 int r; 147 int r;
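
xen_has_pv_devices() and its helpers reduce the unplug bitmask to a yes/no answer that frontend drivers can act on at probe time. The sketch below is a condensed userspace model of that decision table; the flag values are placeholders (the real ones live in include/xen/platform_pci.h) and the !xen_domain() check is dropped for brevity.

/* Condensed model of the xen_has_pv_devices() decision.  Flag values
 * are placeholders; see include/xen/platform_pci.h for the real ones. */
#include <stdbool.h>
#include <stdio.h>

#define UNPLUG_ALL          (1 << 0)    /* placeholder */
#define UNPLUG_NEVER        (1 << 1)    /* placeholder */
#define UNPLUG_UNNECESSARY  (1 << 2)    /* placeholder */

static bool has_pv_devices(bool pv_domain, int unplug)
{
        if (pv_domain)
                return true;            /* PV always has PV devices      */
        if (unplug == 0)
                return false;           /* xen_platform_pci=0 in config  */
        if (unplug & UNPLUG_NEVER)
                return false;           /* user explicitly said no       */
        if (unplug & (UNPLUG_ALL | UNPLUG_UNNECESSARY))
                return true;            /* emulated devices out of the way */
        return false;
}

int main(void)
{
        printf("HVM, unplug=all   -> %d\n", has_pv_devices(false, UNPLUG_ALL));
        printf("HVM, unplug=never -> %d\n", has_pv_devices(false, UNPLUG_NEVER));
        return 0;
}
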
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f59de6..dd5f905e33d5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,6 +27,7 @@
27#include <xen/interface/memory.h> 27#include <xen/interface/memory.h>
28#include <xen/interface/physdev.h> 28#include <xen/interface/physdev.h>
29#include <xen/features.h> 29#include <xen/features.h>
30#include "mmu.h"
30#include "xen-ops.h" 31#include "xen-ops.h"
31#include "vdso.h" 32#include "vdso.h"
32 33
@@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
81 82
82 memblock_reserve(start, size); 83 memblock_reserve(start, size);
83 84
85 if (xen_feature(XENFEAT_auto_translated_physmap))
86 return;
87
84 xen_max_p2m_pfn = PFN_DOWN(start + size); 88 xen_max_p2m_pfn = PFN_DOWN(start + size);
85 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
86 unsigned long mfn = pfn_to_mfn(pfn); 90 unsigned long mfn = pfn_to_mfn(pfn);
@@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
103 .domid = DOMID_SELF 107 .domid = DOMID_SELF
104 }; 108 };
105 unsigned long len = 0; 109 unsigned long len = 0;
110 int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
106 unsigned long pfn; 111 unsigned long pfn;
107 int ret; 112 int ret;
108 113
@@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
116 continue; 121 continue;
117 frame = mfn; 122 frame = mfn;
118 } else { 123 } else {
119 if (mfn != INVALID_P2M_ENTRY) 124 if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
120 continue; 125 continue;
121 frame = pfn; 126 frame = pfn;
122 } 127 }
@@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start,
154static unsigned long __init xen_release_chunk(unsigned long start, 159static unsigned long __init xen_release_chunk(unsigned long start,
155 unsigned long end) 160 unsigned long end)
156{ 161{
162 /*
163 * Xen already ballooned out the E820 non RAM regions for us
164 * and set them up properly in EPT.
165 */
166 if (xen_feature(XENFEAT_auto_translated_physmap))
167 return end - start;
168
157 return xen_do_chunk(start, end, true); 169 return xen_do_chunk(start, end, true);
158} 170}
159 171
@@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk(
222 * (except for the ISA region which must be 1:1 mapped) to 234 * (except for the ISA region which must be 1:1 mapped) to
223 * release the refcounts (in Xen) on the original frames. 235 * release the refcounts (in Xen) on the original frames.
224 */ 236 */
225 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { 237
238 /*
239 * PVH E820 matches the hypervisor's P2M which means we need to
240 * account for the proper values of *release and *identity.
241 */
242 for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
243 pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
226 pte_t pte = __pte_ma(0); 244 pte_t pte = __pte_ma(0);
227 245
228 if (pfn < PFN_UP(ISA_END_ADDRESS)) 246 if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -563,16 +581,13 @@ void xen_enable_nmi(void)
563 BUG(); 581 BUG();
564#endif 582#endif
565} 583}
566void __init xen_arch_setup(void) 584void __init xen_pvmmu_arch_setup(void)
567{ 585{
568 xen_panic_handler_init();
569
570 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 586 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
571 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 587 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
572 588
573 if (!xen_feature(XENFEAT_auto_translated_physmap)) 589 HYPERVISOR_vm_assist(VMASST_CMD_enable,
574 HYPERVISOR_vm_assist(VMASST_CMD_enable, 590 VMASST_TYPE_pae_extended_cr3);
575 VMASST_TYPE_pae_extended_cr3);
576 591
577 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || 592 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
578 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) 593 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
@@ -581,6 +596,15 @@ void __init xen_arch_setup(void)
581 xen_enable_sysenter(); 596 xen_enable_sysenter();
582 xen_enable_syscall(); 597 xen_enable_syscall();
583 xen_enable_nmi(); 598 xen_enable_nmi();
599}
600
601/* This function is not called for HVM domains */
602void __init xen_arch_setup(void)
603{
604 xen_panic_handler_init();
605 if (!xen_feature(XENFEAT_auto_translated_physmap))
606 xen_pvmmu_arch_setup();
607
584#ifdef CONFIG_ACPI 608#ifdef CONFIG_ACPI
585 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 609 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
586 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 610 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
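
For an auto-translated guest, xen_release_chunk() above no longer hands frames back one at a time; it reports the whole range as released because Xen has already ballooned out the non-RAM E820 regions and set them up in the EPT. A small sketch of that accounting follows, with a hypothetical release_frame() standing in for the decrease-reservation path.

/* Sketch of the PVH shortcut in xen_release_chunk(): an auto-translated
 * guest counts the whole chunk as released without any hypercalls.
 * release_frame() is a stand-in invented for this example. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long release_frame(unsigned long pfn)
{
        (void)pfn;
        return 1;                       /* pretend the hypercall succeeded */
}

static unsigned long release_chunk(unsigned long start, unsigned long end,
                                   bool auto_translated)
{
        unsigned long pfn, len = 0;

        if (auto_translated)
                return end - start;     /* Xen already ballooned these out */

        for (pfn = start; pfn < end; pfn++)
                len += release_frame(pfn);
        return len;
}

int main(void)
{
        printf("PV  released: %lu\n", release_chunk(0x100, 0x180, false));
        printf("PVH released: %lu\n", release_chunk(0x100, 0x180, true));
        return 0;
}
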
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c36b325abd83..a18eadd8bb40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -73,9 +73,11 @@ static void cpu_bringup(void)
73 touch_softlockup_watchdog(); 73 touch_softlockup_watchdog();
74 preempt_disable(); 74 preempt_disable();
75 75
76 xen_enable_sysenter(); 76 /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
77 xen_enable_syscall(); 77 if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
78 78 xen_enable_sysenter();
79 xen_enable_syscall();
80 }
79 cpu = smp_processor_id(); 81 cpu = smp_processor_id();
80 smp_store_cpu_info(cpu); 82 smp_store_cpu_info(cpu);
81 cpu_data(cpu).x86_max_cores = 1; 83 cpu_data(cpu).x86_max_cores = 1;
@@ -97,8 +99,14 @@ static void cpu_bringup(void)
97 wmb(); /* make sure everything is out */ 99 wmb(); /* make sure everything is out */
98} 100}
99 101
100static void cpu_bringup_and_idle(void) 102/* Note: cpu parameter is only relevant for PVH */
103static void cpu_bringup_and_idle(int cpu)
101{ 104{
105#ifdef CONFIG_X86_64
106 if (xen_feature(XENFEAT_auto_translated_physmap) &&
107 xen_feature(XENFEAT_supervisor_mode_kernel))
108 xen_pvh_secondary_vcpu_init(cpu);
109#endif
102 cpu_bringup(); 110 cpu_bringup();
103 cpu_startup_entry(CPUHP_ONLINE); 111 cpu_startup_entry(CPUHP_ONLINE);
104} 112}
@@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void)
274 native_smp_prepare_boot_cpu(); 282 native_smp_prepare_boot_cpu();
275 283
276 if (xen_pv_domain()) { 284 if (xen_pv_domain()) {
277 /* We've switched to the "real" per-cpu gdt, so make sure the 285 if (!xen_feature(XENFEAT_writable_page_tables))
278 old memory can be recycled */ 286 /* We've switched to the "real" per-cpu gdt, so make
279 make_lowmem_page_readwrite(xen_initial_gdt); 287 * sure the old memory can be recycled. */
288 make_lowmem_page_readwrite(xen_initial_gdt);
280 289
281#ifdef CONFIG_X86_32 290#ifdef CONFIG_X86_32
282 /* 291 /*
@@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
360 369
361 gdt = get_cpu_gdt_table(cpu); 370 gdt = get_cpu_gdt_table(cpu);
362 371
363 ctxt->flags = VGCF_IN_KERNEL;
364 ctxt->user_regs.ss = __KERNEL_DS;
365#ifdef CONFIG_X86_32 372#ifdef CONFIG_X86_32
373 /* Note: PVH is not yet supported on x86_32. */
366 ctxt->user_regs.fs = __KERNEL_PERCPU; 374 ctxt->user_regs.fs = __KERNEL_PERCPU;
367 ctxt->user_regs.gs = __KERNEL_STACK_CANARY; 375 ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
368#else
369 ctxt->gs_base_kernel = per_cpu_offset(cpu);
370#endif 376#endif
371 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 377 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
372 378
373 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); 379 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
374 380
375 { 381 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
382 ctxt->flags = VGCF_IN_KERNEL;
376 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 383 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
377 ctxt->user_regs.ds = __USER_DS; 384 ctxt->user_regs.ds = __USER_DS;
378 ctxt->user_regs.es = __USER_DS; 385 ctxt->user_regs.es = __USER_DS;
386 ctxt->user_regs.ss = __KERNEL_DS;
379 387
380 xen_copy_trap_info(ctxt->trap_ctxt); 388 xen_copy_trap_info(ctxt->trap_ctxt);
381 389
@@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
396#ifdef CONFIG_X86_32 404#ifdef CONFIG_X86_32
397 ctxt->event_callback_cs = __KERNEL_CS; 405 ctxt->event_callback_cs = __KERNEL_CS;
398 ctxt->failsafe_callback_cs = __KERNEL_CS; 406 ctxt->failsafe_callback_cs = __KERNEL_CS;
407#else
408 ctxt->gs_base_kernel = per_cpu_offset(cpu);
399#endif 409#endif
400 ctxt->event_callback_eip = 410 ctxt->event_callback_eip =
401 (unsigned long)xen_hypervisor_callback; 411 (unsigned long)xen_hypervisor_callback;
402 ctxt->failsafe_callback_eip = 412 ctxt->failsafe_callback_eip =
403 (unsigned long)xen_failsafe_callback; 413 (unsigned long)xen_failsafe_callback;
414 ctxt->user_regs.cs = __KERNEL_CS;
415 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
416#ifdef CONFIG_X86_32
404 } 417 }
405 ctxt->user_regs.cs = __KERNEL_CS; 418#else
419 } else
420 /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
421 * %rdi having the cpu number - which means are passing in
422 * as the first parameter the cpu. Subtle!
423 */
424 ctxt->user_regs.rdi = cpu;
425#endif
406 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 426 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
407
408 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
409 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); 427 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
410
411 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) 428 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
412 BUG(); 429 BUG();
413 430
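
Setting ctxt->user_regs.rdi = cpu works because the x86-64 SysV calling convention passes the first integer argument in %rdi: when the hypervisor starts the secondary VCPU at cpu_bringup_and_idle(), the cpu number is already in the right register. Below is a miniature model of that handoff; the context structure and start_vcpu() are invented for illustration.

/* Miniature model: the boot CPU records an entry point plus an rdi
 * value, and "starting" the VCPU is a call with rdi as argument 0,
 * mirroring what the hypervisor does with the saved register state. */
#include <stdio.h>

struct vcpu_ctx {
        long rdi;                       /* first-argument register       */
        void (*entry)(long cpu);        /* cpu_bringup_and_idle stand-in */
};

static void bringup_and_idle(long cpu)
{
        printf("vcpu %ld: bring up and idle\n", cpu);
}

static void start_vcpu(const struct vcpu_ctx *ctx)
{
        /* The hypervisor loads the saved registers and jumps to entry;
         * in this C model that is simply a call with rdi as the first
         * (and only) argument. */
        ctx->entry(ctx->rdi);
}

int main(void)
{
        struct vcpu_ctx ctx = { .rdi = 3, .entry = bringup_and_idle };

        start_vcpu(&ctx);
        return 0;
}
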
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 12a1ca707b94..7b78f88c1707 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -446,6 +446,7 @@ void xen_setup_timer(int cpu)
446 IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| 446 IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
447 IRQF_FORCE_RESUME, 447 IRQF_FORCE_RESUME,
448 name, NULL); 448 name, NULL);
449 (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
449 450
450 memcpy(evt, xen_clockevent, sizeof(*evt)); 451 memcpy(evt, xen_clockevent, sizeof(*evt));
451 452
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7faed5869e5b..485b69585540 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -11,8 +11,28 @@
11#include <asm/page_types.h> 11#include <asm/page_types.h>
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <xen/interface/features.h>
14#include <asm/xen/interface.h> 15#include <asm/xen/interface.h>
15 16
17#ifdef CONFIG_XEN_PVH
18#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
19/* Note the lack of 'hvm_callback_vector'. Older hypervisors will
20 * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
21 * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
22 */
23#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
24 (1 << XENFEAT_auto_translated_physmap) | \
25 (1 << XENFEAT_supervisor_mode_kernel) | \
26 (1 << XENFEAT_hvm_callback_vector))
27/* The XENFEAT_writable_page_tables is not strictly necessary as we set that
28 * up regardless of whether this CONFIG option is enabled or not, but it
29 * clarifies what the right flags need to be.
30 */
31#else
32#define PVH_FEATURES_STR ""
33#define PVH_FEATURES (0)
34#endif
35
16 __INIT 36 __INIT
17ENTRY(startup_xen) 37ENTRY(startup_xen)
18 cld 38 cld
@@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6)
95#endif 115#endif
96 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) 116 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
97 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) 117 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
98 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 118 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
119 ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
120 (1 << XENFEAT_writable_page_tables) |
121 (1 << XENFEAT_dom0))
99 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 122 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
100 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 123 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
101 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 124 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
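
XEN_ELFNOTE_SUPPORTED_FEATURES is simply a 32-bit mask of XENFEAT_* bit numbers ORed together, as the PVH_FEATURES definition above shows. The short program below assembles the same mask; the bit values are recalled from xen/interface/features.h and should be treated as assumptions to verify against the headers actually in use.

/* Builds a feature mask the way the XEN_ELFNOTE_SUPPORTED_FEATURES
 * note does.  The XENFEAT_* bit numbers are assumptions recalled from
 * xen/interface/features.h; verify them before relying on the values. */
#include <stdio.h>

enum {
        XENFEAT_writable_page_tables       = 0,
        XENFEAT_writable_descriptor_tables = 1,
        XENFEAT_auto_translated_physmap    = 2,
        XENFEAT_supervisor_mode_kernel     = 3,
        XENFEAT_hvm_callback_vector        = 8,
        XENFEAT_dom0                       = 11,
};

int main(void)
{
        unsigned int pvh_features =
                (1u << XENFEAT_writable_page_tables)    |
                (1u << XENFEAT_auto_translated_physmap) |
                (1u << XENFEAT_supervisor_mode_kernel)  |
                (1u << XENFEAT_hvm_callback_vector);

        unsigned int note = pvh_features |
                (1u << XENFEAT_writable_page_tables) |
                (1u << XENFEAT_dom0);

        printf("PVH_FEATURES            = 0x%03x\n", pvh_features);
        printf("SUPPORTED_FEATURES note = 0x%03x\n", note);
        return 0;
}
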
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 95f8c6142328..1cb6f4c37300 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void);
123 123
124extern int xen_panic_handler_init(void); 124extern int xen_panic_handler_init(void);
125 125
126void xen_pvh_secondary_vcpu_init(int cpu);
126#endif /* XEN_OPS_H */ 127#endif /* XEN_OPS_H */