Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 62
-rw-r--r-- | arch/x86/Makefile | 8
-rw-r--r-- | arch/x86/boot/boot.h | 1
-rw-r--r-- | arch/x86/boot/compressed/eboot.c | 2
-rw-r--r-- | arch/x86/boot/compressed/head_32.S | 31
-rw-r--r-- | arch/x86/boot/compressed/head_64.S | 1
-rw-r--r-- | arch/x86/boot/compressed/misc.c | 77
-rw-r--r-- | arch/x86/boot/printf.c | 2
-rw-r--r-- | arch/x86/ia32/ia32_signal.c | 2
-rw-r--r-- | arch/x86/ia32/ia32entry.S | 2
-rw-r--r-- | arch/x86/include/asm/acpi.h | 2
-rw-r--r-- | arch/x86/include/asm/alternative.h | 14
-rw-r--r-- | arch/x86/include/asm/apic.h | 2
-rw-r--r-- | arch/x86/include/asm/asm.h | 6
-rw-r--r-- | arch/x86/include/asm/bitops.h | 46
-rw-r--r-- | arch/x86/include/asm/bootparam_utils.h | 4
-rw-r--r-- | arch/x86/include/asm/checksum_32.h | 22
-rw-r--r-- | arch/x86/include/asm/checksum_64.h | 2
-rw-r--r-- | arch/x86/include/asm/cpufeature.h | 17
-rw-r--r-- | arch/x86/include/asm/e820.h | 2
-rw-r--r-- | arch/x86/include/asm/hw_irq.h | 120
-rw-r--r-- | arch/x86/include/asm/hypervisor.h | 2
-rw-r--r-- | arch/x86/include/asm/irq.h | 2
-rw-r--r-- | arch/x86/include/asm/kprobes.h | 10
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 14
-rw-r--r-- | arch/x86/include/asm/kvm_para.h | 38
-rw-r--r-- | arch/x86/include/asm/mce.h | 16
-rw-r--r-- | arch/x86/include/asm/microcode_amd.h | 2
-rw-r--r-- | arch/x86/include/asm/mmu_context.h | 20
-rw-r--r-- | arch/x86/include/asm/mutex_64.h | 30
-rw-r--r-- | arch/x86/include/asm/page_32_types.h | 2
-rw-r--r-- | arch/x86/include/asm/page_64_types.h | 5
-rw-r--r-- | arch/x86/include/asm/page_types.h | 5
-rw-r--r-- | arch/x86/include/asm/paravirt.h | 32
-rw-r--r-- | arch/x86/include/asm/paravirt_types.h | 17
-rw-r--r-- | arch/x86/include/asm/pgtable-2level.h | 48
-rw-r--r-- | arch/x86/include/asm/pgtable-3level.h | 3
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 33
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 17
-rw-r--r-- | arch/x86/include/asm/processor.h | 34
-rw-r--r-- | arch/x86/include/asm/pvclock.h | 1
-rw-r--r-- | arch/x86/include/asm/setup.h | 8
-rw-r--r-- | arch/x86/include/asm/special_insns.h | 2
-rw-r--r-- | arch/x86/include/asm/spinlock.h | 137
-rw-r--r-- | arch/x86/include/asm/spinlock_types.h | 16
-rw-r--r-- | arch/x86/include/asm/switch_to.h | 4
-rw-r--r-- | arch/x86/include/asm/sync_bitops.h | 24
-rw-r--r-- | arch/x86/include/asm/syscall.h | 3
-rw-r--r-- | arch/x86/include/asm/syscalls.h | 6
-rw-r--r-- | arch/x86/include/asm/sysfb.h | 98
-rw-r--r-- | arch/x86/include/asm/topology.h | 3
-rw-r--r-- | arch/x86/include/asm/traps.h | 6
-rw-r--r-- | arch/x86/include/asm/tsc.h | 1
-rw-r--r-- | arch/x86/include/asm/uaccess.h | 7
-rw-r--r-- | arch/x86/include/asm/vmx.h | 2
-rw-r--r-- | arch/x86/include/asm/vvar.h | 2
-rw-r--r-- | arch/x86/include/asm/xen/events.h | 1
-rw-r--r-- | arch/x86/include/asm/xen/hypervisor.h | 16
-rw-r--r-- | arch/x86/include/uapi/asm/kvm_para.h | 1
-rw-r--r-- | arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r-- | arch/x86/kernel/Makefile | 3
-rw-r--r-- | arch/x86/kernel/acpi/boot.c | 25
-rw-r--r-- | arch/x86/kernel/alternative.c | 155
-rw-r--r-- | arch/x86/kernel/amd_nb.c | 13
-rw-r--r-- | arch/x86/kernel/apic/apic.c | 12
-rw-r--r-- | arch/x86/kernel/apic/io_apic.c | 14
-rw-r--r-- | arch/x86/kernel/apm_32.c | 2
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 24
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 4
-rw-r--r-- | arch/x86/kernel/cpu/hypervisor.c | 15
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 3
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 4
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 28
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_intel.c | 42
-rw-r--r-- | arch/x86/kernel/cpu/mshyperv.c | 13
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 6
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.h | 2
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_amd.c | 3
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel.c | 181
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_ds.c | 32
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_uncore.c | 258
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_uncore.h | 10
-rw-r--r-- | arch/x86/kernel/cpu/vmware.c | 8
-rw-r--r-- | arch/x86/kernel/crash.c | 4
-rw-r--r-- | arch/x86/kernel/e820.c | 5
-rw-r--r-- | arch/x86/kernel/early-quirks.c | 14
-rw-r--r-- | arch/x86/kernel/head32.c | 2
-rw-r--r-- | arch/x86/kernel/head64.c | 2
-rw-r--r-- | arch/x86/kernel/head_32.S | 2
-rw-r--r-- | arch/x86/kernel/i387.c | 2
-rw-r--r-- | arch/x86/kernel/irq.c | 8
-rw-r--r-- | arch/x86/kernel/irq_work.c | 4
-rw-r--r-- | arch/x86/kernel/jump_label.c | 16
-rw-r--r-- | arch/x86/kernel/kprobes/common.h | 5
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 4
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 115
-rw-r--r-- | arch/x86/kernel/kvm.c | 268
-rw-r--r-- | arch/x86/kernel/microcode_amd.c | 36
-rw-r--r-- | arch/x86/kernel/microcode_amd_early.c | 27
-rw-r--r-- | arch/x86/kernel/paravirt-spinlocks.c | 18
-rw-r--r-- | arch/x86/kernel/paravirt.c | 9
-rw-r--r-- | arch/x86/kernel/process.c | 2
-rw-r--r-- | arch/x86/kernel/process_32.c | 2
-rw-r--r-- | arch/x86/kernel/process_64.c | 4
-rw-r--r-- | arch/x86/kernel/pvclock.c | 44
-rw-r--r-- | arch/x86/kernel/setup.c | 27
-rw-r--r-- | arch/x86/kernel/signal.c | 12
-rw-r--r-- | arch/x86/kernel/smp.c | 12
-rw-r--r-- | arch/x86/kernel/sys_x86_64.c | 2
-rw-r--r-- | arch/x86/kernel/syscall_32.c | 2
-rw-r--r-- | arch/x86/kernel/syscall_64.c | 5
-rw-r--r-- | arch/x86/kernel/sysfb.c | 74
-rw-r--r-- | arch/x86/kernel/sysfb_efi.c | 214
-rw-r--r-- | arch/x86/kernel/sysfb_simplefb.c | 95
-rw-r--r-- | arch/x86/kernel/tboot.c | 10
-rw-r--r-- | arch/x86/kernel/traps.c | 4
-rw-r--r-- | arch/x86/kernel/tsc.c | 6
-rw-r--r-- | arch/x86/kvm/cpuid.c | 3
-rw-r--r-- | arch/x86/kvm/lapic.c | 38
-rw-r--r-- | arch/x86/kvm/mmu.c | 181
-rw-r--r-- | arch/x86/kvm/mmu.h | 2
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 178
-rw-r--r-- | arch/x86/kvm/pmu.c | 25
-rw-r--r-- | arch/x86/kvm/vmx.c | 441
-rw-r--r-- | arch/x86/kvm/x86.c | 224
-rw-r--r-- | arch/x86/lib/csum-wrappers_64.c | 12
-rw-r--r-- | arch/x86/lib/usercopy_64.c | 2
-rw-r--r-- | arch/x86/lib/x86-opcode-map.txt | 42
-rw-r--r-- | arch/x86/mm/init.c | 4
-rw-r--r-- | arch/x86/mm/ioremap.c | 5
-rw-r--r-- | arch/x86/mm/mmap.c | 6
-rw-r--r-- | arch/x86/mm/srat.c | 11
-rw-r--r-- | arch/x86/oprofile/nmi_int.c | 18
-rw-r--r-- | arch/x86/oprofile/op_model_amd.c | 24
-rw-r--r-- | arch/x86/pci/acpi.c | 9
-rw-r--r-- | arch/x86/pci/i386.c | 4
-rw-r--r-- | arch/x86/pci/mmconfig-shared.c | 7
-rw-r--r-- | arch/x86/pci/mrst.c | 41
-rw-r--r-- | arch/x86/platform/ce4100/ce4100.c | 1
-rw-r--r-- | arch/x86/power/cpu.c | 8
-rw-r--r-- | arch/x86/power/hibernate_64.c | 12
-rw-r--r-- | arch/x86/tools/gen-insn-attr-x86.awk | 4
-rw-r--r-- | arch/x86/vdso/vclock_gettime.c | 16
-rw-r--r-- | arch/x86/xen/enlighten.c | 24
-rw-r--r-- | arch/x86/xen/irq.c | 25
-rw-r--r-- | arch/x86/xen/p2m.c | 22
-rw-r--r-- | arch/x86/xen/setup.c | 51
-rw-r--r-- | arch/x86/xen/smp.c | 19
-rw-r--r-- | arch/x86/xen/spinlock.c | 387
-rw-r--r-- | arch/x86/xen/xen-ops.h | 16
150 files changed, 3346 insertions(+), 1528 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf92b0ce..5c0ed72c02a2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -16,6 +16,7 @@ config X86_64
16 def_bool y 16 def_bool y
17 depends on 64BIT 17 depends on 64BIT
18 select X86_DEV_DMA_OPS 18 select X86_DEV_DMA_OPS
19 select ARCH_USE_CMPXCHG_LOCKREF
19 20
20### Arch settings 21### Arch settings
21config X86 22config X86
@@ -81,7 +82,6 @@ config X86
81 select HAVE_USER_RETURN_NOTIFIER 82 select HAVE_USER_RETURN_NOTIFIER
82 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 83 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
83 select HAVE_ARCH_JUMP_LABEL 84 select HAVE_ARCH_JUMP_LABEL
84 select HAVE_TEXT_POKE_SMP
85 select HAVE_GENERIC_HARDIRQS 85 select HAVE_GENERIC_HARDIRQS
86 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 86 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
87 select SPARSE_IRQ 87 select SPARSE_IRQ
@@ -632,6 +632,7 @@ config PARAVIRT_DEBUG
632config PARAVIRT_SPINLOCKS 632config PARAVIRT_SPINLOCKS
633 bool "Paravirtualization layer for spinlocks" 633 bool "Paravirtualization layer for spinlocks"
634 depends on PARAVIRT && SMP 634 depends on PARAVIRT && SMP
635 select UNINLINE_SPIN_UNLOCK
635 ---help--- 636 ---help---
636 Paravirtualized spinlocks allow a pvops backend to replace the 637 Paravirtualized spinlocks allow a pvops backend to replace the
637 spinlock implementation with something virtualization-friendly 638 spinlock implementation with something virtualization-friendly
@@ -656,6 +657,15 @@ config KVM_GUEST
656 underlying device model, the host provides the guest with 657 underlying device model, the host provides the guest with
657 timing infrastructure such as time of day, and system time 658 timing infrastructure such as time of day, and system time
658 659
660config KVM_DEBUG_FS
661 bool "Enable debug information for KVM Guests in debugfs"
662 depends on KVM_GUEST && DEBUG_FS
663 default n
664 ---help---
665 This option enables collection of various statistics for KVM guest.
666 Statistics are displayed in debugfs filesystem. Enabling this option
667 may incur significant overhead.
668
659source "arch/x86/lguest/Kconfig" 669source "arch/x86/lguest/Kconfig"
660 670
661config PARAVIRT_TIME_ACCOUNTING 671config PARAVIRT_TIME_ACCOUNTING
@@ -1344,8 +1354,12 @@ config ARCH_SELECT_MEMORY_MODEL
1344 depends on ARCH_SPARSEMEM_ENABLE 1354 depends on ARCH_SPARSEMEM_ENABLE
1345 1355
1346config ARCH_MEMORY_PROBE 1356config ARCH_MEMORY_PROBE
1347 def_bool y 1357 bool "Enable sysfs memory/probe interface"
1348 depends on X86_64 && MEMORY_HOTPLUG 1358 depends on X86_64 && MEMORY_HOTPLUG
1359 help
1360 This option enables a sysfs memory/probe interface for testing.
1361 See Documentation/memory-hotplug.txt for more information.
1362 If you are unsure how to answer this question, answer N.
1349 1363
1350config ARCH_PROC_KCORE_TEXT 1364config ARCH_PROC_KCORE_TEXT
1351 def_bool y 1365 def_bool y
@@ -1627,9 +1641,9 @@ config KEXEC
1627 1641
1628 It is an ongoing process to be certain the hardware in a machine 1642 It is an ongoing process to be certain the hardware in a machine
1629 is properly shutdown, so do not be surprised if this code does not 1643 is properly shutdown, so do not be surprised if this code does not
1630 initially work for you. It may help to enable device hotplugging 1644 initially work for you. As of this writing the exact hardware
1631 support. As of this writing the exact hardware interface is 1645 interface is strongly in flux, so no good recommendation can be
1632 strongly in flux, so no good recommendation can be made. 1646 made.
1633 1647
1634config CRASH_DUMP 1648config CRASH_DUMP
1635 bool "kernel crash dumps" 1649 bool "kernel crash dumps"
@@ -1716,9 +1730,10 @@ config X86_NEED_RELOCS
1716 depends on X86_32 && RELOCATABLE 1730 depends on X86_32 && RELOCATABLE
1717 1731
1718config PHYSICAL_ALIGN 1732config PHYSICAL_ALIGN
1719 hex "Alignment value to which kernel should be aligned" if X86_32 1733 hex "Alignment value to which kernel should be aligned"
1720 default "0x1000000" 1734 default "0x1000000"
1721 range 0x2000 0x1000000 1735 range 0x2000 0x1000000 if X86_32
1736 range 0x200000 0x1000000 if X86_64
1722 ---help--- 1737 ---help---
1723 This value puts the alignment restrictions on physical address 1738 This value puts the alignment restrictions on physical address
1724 where kernel is loaded and run from. Kernel is compiled for an 1739 where kernel is loaded and run from. Kernel is compiled for an
@@ -1736,6 +1751,9 @@ config PHYSICAL_ALIGN
1736 end result is that kernel runs from a physical address meeting 1751 end result is that kernel runs from a physical address meeting
1737 above alignment restrictions. 1752 above alignment restrictions.
1738 1753
1754 On 32-bit this value must be a multiple of 0x2000. On 64-bit
1755 this value must be a multiple of 0x200000.
1756
1739 Don't change this unless you know what you are doing. 1757 Don't change this unless you know what you are doing.
1740 1758
1741config HOTPLUG_CPU 1759config HOTPLUG_CPU
@@ -2270,6 +2288,32 @@ config RAPIDIO
2270 2288
2271source "drivers/rapidio/Kconfig" 2289source "drivers/rapidio/Kconfig"
2272 2290
2291config X86_SYSFB
2292 bool "Mark VGA/VBE/EFI FB as generic system framebuffer"
2293 help
2294 Firmwares often provide initial graphics framebuffers so the BIOS,
2295 bootloader or kernel can show basic video-output during boot for
2296 user-guidance and debugging. Historically, x86 used the VESA BIOS
2297 Extensions and EFI-framebuffers for this, which are mostly limited
2298 to x86.
2299 This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
2300 framebuffers so the new generic system-framebuffer drivers can be
2301 used on x86. If the framebuffer is not compatible with the generic
2302 modes, it is adverticed as fallback platform framebuffer so legacy
2303 drivers like efifb, vesafb and uvesafb can pick it up.
2304 If this option is not selected, all system framebuffers are always
2305 marked as fallback platform framebuffers as usual.
2306
2307 Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will
2308 not be able to pick up generic system framebuffers if this option
2309 is selected. You are highly encouraged to enable simplefb as
2310 replacement if you select this option. simplefb can correctly deal
2311 with generic system framebuffers. But you should still keep vesafb
2312 and others enabled as fallback if a system framebuffer is
2313 incompatible with simplefb.
2314
2315 If unsure, say Y.
2316
2273endmenu 2317endmenu
2274 2318
2275 2319
@@ -2332,10 +2376,6 @@ config HAVE_ATOMIC_IOMAP
2332 def_bool y 2376 def_bool y
2333 depends on X86_32 2377 depends on X86_32
2334 2378
2335config HAVE_TEXT_POKE_SMP
2336 bool
2337 select STOP_MACHINE if SMP
2338
2339config X86_DEV_DMA_OPS 2379config X86_DEV_DMA_OPS
2340 bool 2380 bool
2341 depends on X86_64 || STA2X11 2381 depends on X86_64 || STA2X11
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 07639c656fcd..41250fb33985 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -16,6 +16,10 @@ endif
16# e.g.: obj-y += foo_$(BITS).o 16# e.g.: obj-y += foo_$(BITS).o
17export BITS 17export BITS
18 18
19ifdef CONFIG_X86_NEED_RELOCS
20 LDFLAGS_vmlinux := --emit-relocs
21endif
22
19ifeq ($(CONFIG_X86_32),y) 23ifeq ($(CONFIG_X86_32),y)
20 BITS := 32 24 BITS := 32
21 UTS_MACHINE := i386 25 UTS_MACHINE := i386
@@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y)
25 KBUILD_AFLAGS += $(biarch) 29 KBUILD_AFLAGS += $(biarch)
26 KBUILD_CFLAGS += $(biarch) 30 KBUILD_CFLAGS += $(biarch)
27 31
28 ifdef CONFIG_RELOCATABLE
29 LDFLAGS_vmlinux := --emit-relocs
30 endif
31
32 KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return 32 KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
33 33
34 # Never want PIC in a 32-bit kernel, prevent breakage with GCC built 34 # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 5b7531966b84..ef72baeff484 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count);
355size_t strnlen(const char *s, size_t maxlen); 355size_t strnlen(const char *s, size_t maxlen);
356unsigned int atou(const char *s); 356unsigned int atou(const char *s);
357unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); 357unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
358size_t strlen(const char *s);
358 359
359/* tty.c */ 360/* tty.c */
360void puts(const char *); 361void puts(const char *);
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index d606463aa6d6..b7388a425f09 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -225,7 +225,7 @@ static void low_free(unsigned long size, unsigned long addr)
225 unsigned long nr_pages; 225 unsigned long nr_pages;
226 226
227 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; 227 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
228 efi_call_phys2(sys_table->boottime->free_pages, addr, size); 228 efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages);
229} 229}
230 230
231static void find_bits(unsigned long mask, u8 *pos, u8 *size) 231static void find_bits(unsigned long mask, u8 *pos, u8 *size)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1e3184f6072f..5d6f6891b188 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -181,8 +181,9 @@ relocated:
181/* 181/*
182 * Do the decompression, and jump to the new kernel.. 182 * Do the decompression, and jump to the new kernel..
183 */ 183 */
184 leal z_extract_offset_negative(%ebx), %ebp
185 /* push arguments for decompress_kernel: */ 184 /* push arguments for decompress_kernel: */
185 pushl $z_output_len /* decompressed length */
186 leal z_extract_offset_negative(%ebx), %ebp
186 pushl %ebp /* output address */ 187 pushl %ebp /* output address */
187 pushl $z_input_len /* input_len */ 188 pushl $z_input_len /* input_len */
188 leal input_data(%ebx), %eax 189 leal input_data(%ebx), %eax
@@ -191,33 +192,7 @@ relocated:
191 pushl %eax /* heap area */ 192 pushl %eax /* heap area */
192 pushl %esi /* real mode pointer */ 193 pushl %esi /* real mode pointer */
193 call decompress_kernel 194 call decompress_kernel
194 addl $20, %esp 195 addl $24, %esp
195
196#if CONFIG_RELOCATABLE
197/*
198 * Find the address of the relocations.
199 */
200 leal z_output_len(%ebp), %edi
201
202/*
203 * Calculate the delta between where vmlinux was compiled to run
204 * and where it was actually loaded.
205 */
206 movl %ebp, %ebx
207 subl $LOAD_PHYSICAL_ADDR, %ebx
208 jz 2f /* Nothing to be done if loaded at compiled addr. */
209/*
210 * Process relocations.
211 */
212
2131: subl $4, %edi
214 movl (%edi), %ecx
215 testl %ecx, %ecx
216 jz 2f
217 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
218 jmp 1b
2192:
220#endif
221 196
222/* 197/*
223 * Jump to the decompressed kernel. 198 * Jump to the decompressed kernel.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 06e71c2c16bf..c337422b575d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -338,6 +338,7 @@ relocated:
338 leaq input_data(%rip), %rdx /* input_data */ 338 leaq input_data(%rip), %rdx /* input_data */
339 movl $z_input_len, %ecx /* input_len */ 339 movl $z_input_len, %ecx /* input_len */
340 movq %rbp, %r8 /* output target address */ 340 movq %rbp, %r8 /* output target address */
341 movq $z_output_len, %r9 /* decompressed length */
341 call decompress_kernel 342 call decompress_kernel
342 popq %rsi 343 popq %rsi
343 344
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 0319c88290a5..434f077d2c4d 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -271,6 +271,79 @@ static void error(char *x)
271 asm("hlt"); 271 asm("hlt");
272} 272}
273 273
274#if CONFIG_X86_NEED_RELOCS
275static void handle_relocations(void *output, unsigned long output_len)
276{
277 int *reloc;
278 unsigned long delta, map, ptr;
279 unsigned long min_addr = (unsigned long)output;
280 unsigned long max_addr = min_addr + output_len;
281
282 /*
283 * Calculate the delta between where vmlinux was linked to load
284 * and where it was actually loaded.
285 */
286 delta = min_addr - LOAD_PHYSICAL_ADDR;
287 if (!delta) {
288 debug_putstr("No relocation needed... ");
289 return;
290 }
291 debug_putstr("Performing relocations... ");
292
293 /*
294 * The kernel contains a table of relocation addresses. Those
295 * addresses have the final load address of the kernel in virtual
296 * memory. We are currently working in the self map. So we need to
297 * create an adjustment for kernel memory addresses to the self map.
298 * This will involve subtracting out the base address of the kernel.
299 */
300 map = delta - __START_KERNEL_map;
301
302 /*
303 * Process relocations: 32 bit relocations first then 64 bit after.
304 * Two sets of binary relocations are added to the end of the kernel
305 * before compression. Each relocation table entry is the kernel
306 * address of the location which needs to be updated stored as a
307 * 32-bit value which is sign extended to 64 bits.
308 *
309 * Format is:
310 *
311 * kernel bits...
312 * 0 - zero terminator for 64 bit relocations
313 * 64 bit relocation repeated
314 * 0 - zero terminator for 32 bit relocations
315 * 32 bit relocation repeated
316 *
317 * So we work backwards from the end of the decompressed image.
318 */
319 for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
320 int extended = *reloc;
321 extended += map;
322
323 ptr = (unsigned long)extended;
324 if (ptr < min_addr || ptr > max_addr)
325 error("32-bit relocation outside of kernel!\n");
326
327 *(uint32_t *)ptr += delta;
328 }
329#ifdef CONFIG_X86_64
330 for (reloc--; *reloc; reloc--) {
331 long extended = *reloc;
332 extended += map;
333
334 ptr = (unsigned long)extended;
335 if (ptr < min_addr || ptr > max_addr)
336 error("64-bit relocation outside of kernel!\n");
337
338 *(uint64_t *)ptr += delta;
339 }
340#endif
341}
342#else
343static inline void handle_relocations(void *output, unsigned long output_len)
344{ }
345#endif
346
274static void parse_elf(void *output) 347static void parse_elf(void *output)
275{ 348{
276#ifdef CONFIG_X86_64 349#ifdef CONFIG_X86_64
@@ -325,7 +398,8 @@ static void parse_elf(void *output)
325asmlinkage void decompress_kernel(void *rmode, memptr heap, 398asmlinkage void decompress_kernel(void *rmode, memptr heap,
326 unsigned char *input_data, 399 unsigned char *input_data,
327 unsigned long input_len, 400 unsigned long input_len,
328 unsigned char *output) 401 unsigned char *output,
402 unsigned long output_len)
329{ 403{
330 real_mode = rmode; 404 real_mode = rmode;
331 405
@@ -365,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
365 debug_putstr("\nDecompressing Linux... "); 439 debug_putstr("\nDecompressing Linux... ");
366 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 440 decompress(input_data, input_len, NULL, NULL, output, NULL, error);
367 parse_elf(output); 441 parse_elf(output);
442 handle_relocations(output, output_len);
368 debug_putstr("done.\nBooting the kernel.\n"); 443 debug_putstr("done.\nBooting the kernel.\n");
369 return; 444 return;
370} 445}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index cdac91ca55d3..565083c16e5c 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision,
55 locase = (type & SMALL); 55 locase = (type & SMALL);
56 if (type & LEFT) 56 if (type & LEFT)
57 type &= ~ZEROPAD; 57 type &= ~ZEROPAD;
58 if (base < 2 || base > 36) 58 if (base < 2 || base > 16)
59 return NULL; 59 return NULL;
60 c = (type & ZEROPAD) ? '0' : ' '; 60 c = (type & ZEROPAD) ? '0' : ' ';
61 sign = 0; 61 sign = 0;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index bccfca68430e..665a730307f2 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -457,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
457 else 457 else
458 put_user_ex(0, &frame->uc.uc_flags); 458 put_user_ex(0, &frame->uc.uc_flags);
459 put_user_ex(0, &frame->uc.uc_link); 459 put_user_ex(0, &frame->uc.uc_link);
460 err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); 460 compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
461 461
462 if (ksig->ka.sa.sa_flags & SA_RESTORER) 462 if (ksig->ka.sa.sa_flags & SA_RESTORER)
463 restorer = ksig->ka.sa.sa_restorer; 463 restorer = ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 474dc1b59f72..4299eb05023c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -452,7 +452,7 @@ ia32_badsys:
452 452
453 CFI_ENDPROC 453 CFI_ENDPROC
454 454
455 .macro PTREGSCALL label, func, arg 455 .macro PTREGSCALL label, func
456 ALIGN 456 ALIGN
457GLOBAL(\label) 457GLOBAL(\label)
458 leaq \func(%rip),%rax 458 leaq \func(%rip),%rax
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2dfac58f3b11..b1977bad5435 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
86extern int acpi_skip_timer_override; 86extern int acpi_skip_timer_override;
87extern int acpi_use_timer_override; 87extern int acpi_use_timer_override;
88extern int acpi_fix_pin2_polarity; 88extern int acpi_fix_pin2_polarity;
89extern int acpi_disable_cmcff;
89 90
90extern u8 acpi_sci_flags; 91extern u8 acpi_sci_flags;
91extern int acpi_sci_override_gsi; 92extern int acpi_sci_override_gsi;
@@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
168 169
169#define acpi_lapic 0 170#define acpi_lapic 0
170#define acpi_ioapic 0 171#define acpi_ioapic 0
172#define acpi_disable_cmcff 0
171static inline void acpi_noirq_set(void) { } 173static inline void acpi_noirq_set(void) { }
172static inline void acpi_disable_pci(void) { } 174static inline void acpi_disable_pci(void) { }
173static inline void disable_acpi(void) { } 175static inline void disable_acpi(void) { }
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 58ed6d96a6ac..0a3f9c9f98d5 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -5,6 +5,7 @@
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <asm/asm.h> 7#include <asm/asm.h>
8#include <asm/ptrace.h>
8 9
9/* 10/*
10 * Alternative inline assembly for SMP. 11 * Alternative inline assembly for SMP.
@@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
220 * no thread can be preempted in the instructions being modified (no iret to an 221 * no thread can be preempted in the instructions being modified (no iret to an
221 * invalid instruction possible) or if the instructions are changed from a 222 * invalid instruction possible) or if the instructions are changed from a
222 * consistent state to another consistent state atomically. 223 * consistent state to another consistent state atomically.
223 * More care must be taken when modifying code in the SMP case because of
224 * Intel's errata. text_poke_smp() takes care that errata, but still
225 * doesn't support NMI/MCE handler code modifying.
226 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 224 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
227 * inconsistent instruction while you patch. 225 * inconsistent instruction while you patch.
228 */ 226 */
229struct text_poke_param {
230 void *addr;
231 const void *opcode;
232 size_t len;
233};
234
235extern void *text_poke(void *addr, const void *opcode, size_t len); 227extern void *text_poke(void *addr, const void *opcode, size_t len);
236extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 228extern int poke_int3_handler(struct pt_regs *regs);
237extern void text_poke_smp_batch(struct text_poke_param *params, int n); 229extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
238 230
239#endif /* _ASM_X86_ALTERNATIVE_H */ 231#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f8119b582c3c..1d2091a226bc 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -715,4 +715,6 @@ static inline void exiting_ack_irq(void)
715 ack_APIC_irq(); 715 ack_APIC_irq();
716} 716}
717 717
718extern void ioapic_zap_locks(void);
719
718#endif /* _ASM_X86_APIC_H */ 720#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 1c2d247f65ce..4582e8e1cd1a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,21 +3,25 @@
3 3
4#ifdef __ASSEMBLY__ 4#ifdef __ASSEMBLY__
5# define __ASM_FORM(x) x 5# define __ASM_FORM(x) x
6# define __ASM_FORM_RAW(x) x
6# define __ASM_FORM_COMMA(x) x, 7# define __ASM_FORM_COMMA(x) x,
7#else 8#else
8# define __ASM_FORM(x) " " #x " " 9# define __ASM_FORM(x) " " #x " "
10# define __ASM_FORM_RAW(x) #x
9# define __ASM_FORM_COMMA(x) " " #x "," 11# define __ASM_FORM_COMMA(x) " " #x ","
10#endif 12#endif
11 13
12#ifdef CONFIG_X86_32 14#ifdef CONFIG_X86_32
13# define __ASM_SEL(a,b) __ASM_FORM(a) 15# define __ASM_SEL(a,b) __ASM_FORM(a)
16# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
14#else 17#else
15# define __ASM_SEL(a,b) __ASM_FORM(b) 18# define __ASM_SEL(a,b) __ASM_FORM(b)
19# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
16#endif 20#endif
17 21
18#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ 22#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
19 inst##q##__VA_ARGS__) 23 inst##q##__VA_ARGS__)
20#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) 24#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg)
21 25
22#define _ASM_PTR __ASM_SEL(.long, .quad) 26#define _ASM_PTR __ASM_SEL(.long, .quad)
23#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) 27#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 6dfd0195bb55..41639ce8fd63 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,14 @@
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <asm/alternative.h> 16#include <asm/alternative.h>
17 17
18#if BITS_PER_LONG == 32
19# define _BITOPS_LONG_SHIFT 5
20#elif BITS_PER_LONG == 64
21# define _BITOPS_LONG_SHIFT 6
22#else
23# error "Unexpected BITS_PER_LONG"
24#endif
25
18#define BIT_64(n) (U64_C(1) << (n)) 26#define BIT_64(n) (U64_C(1) << (n))
19 27
20/* 28/*
@@ -59,7 +67,7 @@
59 * restricted to acting on a single-word quantity. 67 * restricted to acting on a single-word quantity.
60 */ 68 */
61static __always_inline void 69static __always_inline void
62set_bit(unsigned int nr, volatile unsigned long *addr) 70set_bit(long nr, volatile unsigned long *addr)
63{ 71{
64 if (IS_IMMEDIATE(nr)) { 72 if (IS_IMMEDIATE(nr)) {
65 asm volatile(LOCK_PREFIX "orb %1,%0" 73 asm volatile(LOCK_PREFIX "orb %1,%0"
@@ -81,7 +89,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr)
81 * If it's called on the same region of memory simultaneously, the effect 89 * If it's called on the same region of memory simultaneously, the effect
82 * may be that only one operation succeeds. 90 * may be that only one operation succeeds.
83 */ 91 */
84static inline void __set_bit(int nr, volatile unsigned long *addr) 92static inline void __set_bit(long nr, volatile unsigned long *addr)
85{ 93{
86 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); 94 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
87} 95}
@@ -97,7 +105,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
97 * in order to ensure changes are visible on other processors. 105 * in order to ensure changes are visible on other processors.
98 */ 106 */
99static __always_inline void 107static __always_inline void
100clear_bit(int nr, volatile unsigned long *addr) 108clear_bit(long nr, volatile unsigned long *addr)
101{ 109{
102 if (IS_IMMEDIATE(nr)) { 110 if (IS_IMMEDIATE(nr)) {
103 asm volatile(LOCK_PREFIX "andb %1,%0" 111 asm volatile(LOCK_PREFIX "andb %1,%0"
@@ -118,13 +126,13 @@ clear_bit(int nr, volatile unsigned long *addr)
118 * clear_bit() is atomic and implies release semantics before the memory 126 * clear_bit() is atomic and implies release semantics before the memory
119 * operation. It can be used for an unlock. 127 * operation. It can be used for an unlock.
120 */ 128 */
121static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) 129static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
122{ 130{
123 barrier(); 131 barrier();
124 clear_bit(nr, addr); 132 clear_bit(nr, addr);
125} 133}
126 134
127static inline void __clear_bit(int nr, volatile unsigned long *addr) 135static inline void __clear_bit(long nr, volatile unsigned long *addr)
128{ 136{
129 asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); 137 asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
130} 138}
@@ -141,7 +149,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
141 * No memory barrier is required here, because x86 cannot reorder stores past 149 * No memory barrier is required here, because x86 cannot reorder stores past
142 * older loads. Same principle as spin_unlock. 150 * older loads. Same principle as spin_unlock.
143 */ 151 */
144static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) 152static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
145{ 153{
146 barrier(); 154 barrier();
147 __clear_bit(nr, addr); 155 __clear_bit(nr, addr);
@@ -159,7 +167,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
159 * If it's called on the same region of memory simultaneously, the effect 167 * If it's called on the same region of memory simultaneously, the effect
160 * may be that only one operation succeeds. 168 * may be that only one operation succeeds.
161 */ 169 */
162static inline void __change_bit(int nr, volatile unsigned long *addr) 170static inline void __change_bit(long nr, volatile unsigned long *addr)
163{ 171{
164 asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); 172 asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
165} 173}
@@ -173,7 +181,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
173 * Note that @nr may be almost arbitrarily large; this function is not 181 * Note that @nr may be almost arbitrarily large; this function is not
174 * restricted to acting on a single-word quantity. 182 * restricted to acting on a single-word quantity.
175 */ 183 */
176static inline void change_bit(int nr, volatile unsigned long *addr) 184static inline void change_bit(long nr, volatile unsigned long *addr)
177{ 185{
178 if (IS_IMMEDIATE(nr)) { 186 if (IS_IMMEDIATE(nr)) {
179 asm volatile(LOCK_PREFIX "xorb %1,%0" 187 asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -194,7 +202,7 @@ static inline void change_bit(int nr, volatile unsigned long *addr)
194 * This operation is atomic and cannot be reordered. 202 * This operation is atomic and cannot be reordered.
195 * It also implies a memory barrier. 203 * It also implies a memory barrier.
196 */ 204 */
197static inline int test_and_set_bit(int nr, volatile unsigned long *addr) 205static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
198{ 206{
199 int oldbit; 207 int oldbit;
200 208
@@ -212,7 +220,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
212 * This is the same as test_and_set_bit on x86. 220 * This is the same as test_and_set_bit on x86.
213 */ 221 */
214static __always_inline int 222static __always_inline int
215test_and_set_bit_lock(int nr, volatile unsigned long *addr) 223test_and_set_bit_lock(long nr, volatile unsigned long *addr)
216{ 224{
217 return test_and_set_bit(nr, addr); 225 return test_and_set_bit(nr, addr);
218} 226}
@@ -226,7 +234,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr)
226 * If two examples of this operation race, one can appear to succeed 234 * If two examples of this operation race, one can appear to succeed
227 * but actually fail. You must protect multiple accesses with a lock. 235 * but actually fail. You must protect multiple accesses with a lock.
228 */ 236 */
229static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 237static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
230{ 238{
231 int oldbit; 239 int oldbit;
232 240
@@ -245,7 +253,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
245 * This operation is atomic and cannot be reordered. 253 * This operation is atomic and cannot be reordered.
246 * It also implies a memory barrier. 254 * It also implies a memory barrier.
247 */ 255 */
248static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) 256static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
249{ 257{
250 int oldbit; 258 int oldbit;
251 259
@@ -272,7 +280,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
272 * accessed from a hypervisor on the same CPU if running in a VM: don't change 280 * accessed from a hypervisor on the same CPU if running in a VM: don't change
273 * this without also updating arch/x86/kernel/kvm.c 281 * this without also updating arch/x86/kernel/kvm.c
274 */ 282 */
275static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 283static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
276{ 284{
277 int oldbit; 285 int oldbit;
278 286
@@ -284,7 +292,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
284} 292}
285 293
286/* WARNING: non atomic and it can be reordered! */ 294/* WARNING: non atomic and it can be reordered! */
287static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) 295static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
288{ 296{
289 int oldbit; 297 int oldbit;
290 298
@@ -304,7 +312,7 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
304 * This operation is atomic and cannot be reordered. 312 * This operation is atomic and cannot be reordered.
305 * It also implies a memory barrier. 313 * It also implies a memory barrier.
306 */ 314 */
307static inline int test_and_change_bit(int nr, volatile unsigned long *addr) 315static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
308{ 316{
309 int oldbit; 317 int oldbit;
310 318
@@ -315,13 +323,13 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
315 return oldbit; 323 return oldbit;
316} 324}
317 325
318static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) 326static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
319{ 327{
320 return ((1UL << (nr % BITS_PER_LONG)) & 328 return ((1UL << (nr & (BITS_PER_LONG-1))) &
321 (addr[nr / BITS_PER_LONG])) != 0; 329 (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
322} 330}
323 331
324static inline int variable_test_bit(int nr, volatile const unsigned long *addr) 332static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
325{ 333{
326 int oldbit; 334 int oldbit;
327 335
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 653668d140f9..4a8cb8d7cbd5 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params)
35 */ 35 */
36 if (boot_params->sentinel) { 36 if (boot_params->sentinel) {
37 /* fields in boot_params are left uninitialized, clear them */ 37 /* fields in boot_params are left uninitialized, clear them */
38 memset(&boot_params->olpc_ofw_header, 0, 38 memset(&boot_params->ext_ramdisk_image, 0,
39 (char *)&boot_params->efi_info - 39 (char *)&boot_params->efi_info -
40 (char *)&boot_params->olpc_ofw_header); 40 (char *)&boot_params->ext_ramdisk_image);
41 memset(&boot_params->kbd_status, 0, 41 memset(&boot_params->kbd_status, 0,
42 (char *)&boot_params->hdr - 42 (char *)&boot_params->hdr -
43 (char *)&boot_params->kbd_status); 43 (char *)&boot_params->kbd_status);
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 46fc474fd819..f50de6951738 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src,
49 int len, __wsum sum, 49 int len, __wsum sum,
50 int *err_ptr) 50 int *err_ptr)
51{ 51{
52 __wsum ret;
53
52 might_sleep(); 54 might_sleep();
53 return csum_partial_copy_generic((__force void *)src, dst, 55 stac();
54 len, sum, err_ptr, NULL); 56 ret = csum_partial_copy_generic((__force void *)src, dst,
57 len, sum, err_ptr, NULL);
58 clac();
59
60 return ret;
55} 61}
56 62
57/* 63/*
@@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src,
176 int len, __wsum sum, 182 int len, __wsum sum,
177 int *err_ptr) 183 int *err_ptr)
178{ 184{
185 __wsum ret;
186
179 might_sleep(); 187 might_sleep();
180 if (access_ok(VERIFY_WRITE, dst, len)) 188 if (access_ok(VERIFY_WRITE, dst, len)) {
181 return csum_partial_copy_generic(src, (__force void *)dst, 189 stac();
182 len, sum, NULL, err_ptr); 190 ret = csum_partial_copy_generic(src, (__force void *)dst,
191 len, sum, NULL, err_ptr);
192 clac();
193 return ret;
194 }
183 195
184 if (len) 196 if (len)
185 *err_ptr = -EFAULT; 197 *err_ptr = -EFAULT;
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 9bfdc41629ec..e6fd8a026c7b 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -133,7 +133,7 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum);
133 133
134 134
135/* Do not call this directly. Use the wrappers below */ 135/* Do not call this directly. Use the wrappers below */
136extern __wsum csum_partial_copy_generic(const void *src, const void *dst, 136extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst,
137 int len, __wsum sum, 137 int len, __wsum sum,
138 int *src_err_ptr, int *dst_err_ptr); 138 int *src_err_ptr, int *dst_err_ptr);
139 139
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 47538a61c91b..d3f5c63078d8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit);
366 */ 366 */
367static __always_inline __pure bool __static_cpu_has(u16 bit) 367static __always_inline __pure bool __static_cpu_has(u16 bit)
368{ 368{
369#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 369#ifdef CC_HAVE_ASM_GOTO
370 370
371#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS 371#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
372
372 /* 373 /*
373 * Catch too early usage of this before alternatives 374 * Catch too early usage of this before alternatives
374 * have run. 375 * have run.
@@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
384 ".previous\n" 385 ".previous\n"
385 /* skipping size check since replacement size = 0 */ 386 /* skipping size check since replacement size = 0 */
386 : : "i" (X86_FEATURE_ALWAYS) : : t_warn); 387 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
388
387#endif 389#endif
388 390
389 asm goto("1: jmp %l[t_no]\n" 391 asm goto("1: jmp %l[t_no]\n"
@@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
406 warn_pre_alternatives(); 408 warn_pre_alternatives();
407 return false; 409 return false;
408#endif 410#endif
409#else /* GCC_VERSION >= 40500 */ 411
412#else /* CC_HAVE_ASM_GOTO */
413
410 u8 flag; 414 u8 flag;
411 /* Open-coded due to __stringify() in ALTERNATIVE() */ 415 /* Open-coded due to __stringify() in ALTERNATIVE() */
412 asm volatile("1: movb $0,%0\n" 416 asm volatile("1: movb $0,%0\n"
@@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
427 ".previous\n" 431 ".previous\n"
428 : "=qm" (flag) : "i" (bit)); 432 : "=qm" (flag) : "i" (bit));
429 return flag; 433 return flag;
430#endif 434
435#endif /* CC_HAVE_ASM_GOTO */
431} 436}
432 437
433#define static_cpu_has(bit) \ 438#define static_cpu_has(bit) \
@@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
441 446
442static __always_inline __pure bool _static_cpu_has_safe(u16 bit) 447static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
443{ 448{
444#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 449#ifdef CC_HAVE_ASM_GOTO
445/* 450/*
446 * We need to spell the jumps to the compiler because, depending on the offset, 451 * We need to spell the jumps to the compiler because, depending on the offset,
447 * the replacement jump can be bigger than the original jump, and this we cannot 452 * the replacement jump can be bigger than the original jump, and this we cannot
@@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
475 return false; 480 return false;
476 t_dynamic: 481 t_dynamic:
477 return __static_cpu_has_safe(bit); 482 return __static_cpu_has_safe(bit);
478#else /* GCC_VERSION >= 40500 */ 483#else
479 u8 flag; 484 u8 flag;
480 /* Open-coded due to __stringify() in ALTERNATIVE() */ 485 /* Open-coded due to __stringify() in ALTERNATIVE() */
481 asm volatile("1: movb $2,%0\n" 486 asm volatile("1: movb $2,%0\n"
@@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
511 : "=qm" (flag) 516 : "=qm" (flag)
512 : "i" (bit), "i" (X86_FEATURE_ALWAYS)); 517 : "i" (bit), "i" (X86_FEATURE_ALWAYS));
513 return (flag == 2 ? __static_cpu_has_safe(bit) : flag); 518 return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
514#endif 519#endif /* CC_HAVE_ASM_GOTO */
515} 520}
516 521
517#define static_cpu_has_safe(bit) \ 522#define static_cpu_has_safe(bit) \
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cccd07fa5e3a..779c2efe2e97 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -29,7 +29,7 @@ extern void e820_setup_gap(void);
29extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 29extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
30 unsigned long start_addr, unsigned long long end_addr); 30 unsigned long start_addr, unsigned long long end_addr);
31struct setup_data; 31struct setup_data;
32extern void parse_e820_ext(struct setup_data *data); 32extern void parse_e820_ext(u64 phys_addr, u32 data_len);
33 33
34#if defined(CONFIG_X86_64) || \ 34#if defined(CONFIG_X86_64) || \
35 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 35 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e4ac559c4a24..92b3bae08b74 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -26,56 +26,56 @@
26#include <asm/sections.h> 26#include <asm/sections.h>
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern asmlinkage void apic_timer_interrupt(void);
30extern void x86_platform_ipi(void); 30extern asmlinkage void x86_platform_ipi(void);
31extern void kvm_posted_intr_ipi(void); 31extern asmlinkage void kvm_posted_intr_ipi(void);
32extern void error_interrupt(void); 32extern asmlinkage void error_interrupt(void);
33extern void irq_work_interrupt(void); 33extern asmlinkage void irq_work_interrupt(void);
34 34
35extern void spurious_interrupt(void); 35extern asmlinkage void spurious_interrupt(void);
36extern void thermal_interrupt(void); 36extern asmlinkage void thermal_interrupt(void);
37extern void reschedule_interrupt(void); 37extern asmlinkage void reschedule_interrupt(void);
38 38
39extern void invalidate_interrupt(void); 39extern asmlinkage void invalidate_interrupt(void);
40extern void invalidate_interrupt0(void); 40extern asmlinkage void invalidate_interrupt0(void);
41extern void invalidate_interrupt1(void); 41extern asmlinkage void invalidate_interrupt1(void);
42extern void invalidate_interrupt2(void); 42extern asmlinkage void invalidate_interrupt2(void);
43extern void invalidate_interrupt3(void); 43extern asmlinkage void invalidate_interrupt3(void);
44extern void invalidate_interrupt4(void); 44extern asmlinkage void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern asmlinkage void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern asmlinkage void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern asmlinkage void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void); 48extern asmlinkage void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void); 49extern asmlinkage void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void); 50extern asmlinkage void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void); 51extern asmlinkage void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void); 52extern asmlinkage void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void); 53extern asmlinkage void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void); 54extern asmlinkage void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void); 55extern asmlinkage void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void); 56extern asmlinkage void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void); 57extern asmlinkage void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void); 58extern asmlinkage void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void); 59extern asmlinkage void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void); 60extern asmlinkage void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void); 61extern asmlinkage void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void); 62extern asmlinkage void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void); 63extern asmlinkage void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void); 64extern asmlinkage void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void); 65extern asmlinkage void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void); 66extern asmlinkage void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void); 67extern asmlinkage void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void); 68extern asmlinkage void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void); 69extern asmlinkage void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void); 70extern asmlinkage void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void); 71extern asmlinkage void invalidate_interrupt31(void);
72 72
73extern void irq_move_cleanup_interrupt(void); 73extern asmlinkage void irq_move_cleanup_interrupt(void);
74extern void reboot_interrupt(void); 74extern asmlinkage void reboot_interrupt(void);
75extern void threshold_interrupt(void); 75extern asmlinkage void threshold_interrupt(void);
76 76
77extern void call_function_interrupt(void); 77extern asmlinkage void call_function_interrupt(void);
78extern void call_function_single_interrupt(void); 78extern asmlinkage void call_function_single_interrupt(void);
79 79
80#ifdef CONFIG_TRACING 80#ifdef CONFIG_TRACING
81/* Interrupt handlers registered during init_IRQ */ 81/* Interrupt handlers registered during init_IRQ */
@@ -172,22 +172,18 @@ extern atomic_t irq_mis_count;
172extern void eisa_set_level_irq(unsigned int irq); 172extern void eisa_set_level_irq(unsigned int irq);
173 173
174/* SMP */ 174/* SMP */
175extern void smp_apic_timer_interrupt(struct pt_regs *); 175extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
176extern void smp_spurious_interrupt(struct pt_regs *); 176extern __visible void smp_spurious_interrupt(struct pt_regs *);
177extern void smp_x86_platform_ipi(struct pt_regs *); 177extern __visible void smp_x86_platform_ipi(struct pt_regs *);
178extern void smp_error_interrupt(struct pt_regs *); 178extern __visible void smp_error_interrupt(struct pt_regs *);
179#ifdef CONFIG_X86_IO_APIC 179#ifdef CONFIG_X86_IO_APIC
180extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 180extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
181#endif 181#endif
182#ifdef CONFIG_SMP 182#ifdef CONFIG_SMP
183extern void smp_reschedule_interrupt(struct pt_regs *); 183extern __visible void smp_reschedule_interrupt(struct pt_regs *);
184extern void smp_call_function_interrupt(struct pt_regs *); 184extern __visible void smp_call_function_interrupt(struct pt_regs *);
185extern void smp_call_function_single_interrupt(struct pt_regs *); 185extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
186#ifdef CONFIG_X86_32 186extern __visible void smp_invalidate_interrupt(struct pt_regs *);
187extern void smp_invalidate_interrupt(struct pt_regs *);
188#else
189extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
190#endif
191#endif 187#endif
192 188
193extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); 189extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 2d4b5e6107cd..e42f758a0fbd 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -33,7 +33,7 @@ struct hypervisor_x86 {
33 const char *name; 33 const char *name;
34 34
35 /* Detection routine */ 35 /* Detection routine */
36 bool (*detect)(void); 36 uint32_t (*detect)(void);
37 37
38 /* Adjust CPU feature bits (run once per CPU) */ 38 /* Adjust CPU feature bits (run once per CPU) */
39 void (*set_cpu_features)(struct cpuinfo_x86 *); 39 void (*set_cpu_features)(struct cpuinfo_x86 *);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 57873beb3292..0ea10f27d613 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -33,7 +33,7 @@ extern void (*x86_platform_ipi_callback)(void);
33extern void native_init_IRQ(void); 33extern void native_init_IRQ(void);
34extern bool handle_irq(unsigned irq, struct pt_regs *regs); 34extern bool handle_irq(unsigned irq, struct pt_regs *regs);
35 35
36extern unsigned int do_IRQ(struct pt_regs *regs); 36extern __visible unsigned int do_IRQ(struct pt_regs *regs);
37 37
38/* Interrupt vector management */ 38/* Interrupt vector management */
39extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 39extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 5a6d2873f80e..9454c167629f 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -49,10 +49,10 @@ typedef u8 kprobe_opcode_t;
49#define flush_insn_slot(p) do { } while (0) 49#define flush_insn_slot(p) do { } while (0)
50 50
51/* optinsn template addresses */ 51/* optinsn template addresses */
52extern kprobe_opcode_t optprobe_template_entry; 52extern __visible kprobe_opcode_t optprobe_template_entry;
53extern kprobe_opcode_t optprobe_template_val; 53extern __visible kprobe_opcode_t optprobe_template_val;
54extern kprobe_opcode_t optprobe_template_call; 54extern __visible kprobe_opcode_t optprobe_template_call;
55extern kprobe_opcode_t optprobe_template_end; 55extern __visible kprobe_opcode_t optprobe_template_end;
56#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) 56#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
57#define MAX_OPTINSN_SIZE \ 57#define MAX_OPTINSN_SIZE \
58 (((unsigned long)&optprobe_template_end - \ 58 (((unsigned long)&optprobe_template_end - \
@@ -62,7 +62,7 @@ extern kprobe_opcode_t optprobe_template_end;
62extern const int kretprobe_blacklist_size; 62extern const int kretprobe_blacklist_size;
63 63
64void arch_remove_kprobe(struct kprobe *p); 64void arch_remove_kprobe(struct kprobe *p);
65void kretprobe_trampoline(void); 65asmlinkage void kretprobe_trampoline(void);
66 66
67/* Architecture specific copy of original instruction*/ 67/* Architecture specific copy of original instruction*/
68struct arch_specific_insn { 68struct arch_specific_insn {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
286 u64 *pae_root; 286 u64 *pae_root;
287 u64 *lm_root; 287 u64 *lm_root;
288 u64 rsvd_bits_mask[2][4]; 288 u64 rsvd_bits_mask[2][4];
289 u64 bad_mt_xwr;
289 290
290 /* 291 /*
291 * Bitmap: bit set = last pte in walk 292 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
323 u64 global_ovf_ctrl; 324 u64 global_ovf_ctrl;
324 u64 counter_bitmask[2]; 325 u64 counter_bitmask[2];
325 u64 global_ctrl_mask; 326 u64 global_ctrl_mask;
327 u64 reserved_bits;
326 u8 version; 328 u8 version;
327 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; 329 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
328 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; 330 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
511 * instruction. 513 * instruction.
512 */ 514 */
513 bool write_fault_to_shadow_pgtable; 515 bool write_fault_to_shadow_pgtable;
516
517 /* set at EPT violation at this point */
518 unsigned long exit_qualification;
519
520 /* pv related host specific info */
521 struct {
522 bool pv_unhalted;
523 } pv;
514}; 524};
515 525
516struct kvm_lpage_info { 526struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
802extern u32 kvm_max_guest_tsc_khz; 812extern u32 kvm_max_guest_tsc_khz;
803 813
804enum emulation_result { 814enum emulation_result {
805 EMULATE_DONE, /* no further processing */ 815 EMULATE_DONE, /* no further processing */
806 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 816 EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
807 EMULATE_FAIL, /* can't emulate this instruction */ 817 EMULATE_FAIL, /* can't emulate this instruction */
808}; 818};
809 819
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 695399f2d5eb..1df115909758 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
85 return ret; 85 return ret;
86} 86}
87 87
88static inline bool kvm_para_available(void) 88static inline uint32_t kvm_cpuid_base(void)
89{ 89{
90 unsigned int eax, ebx, ecx, edx;
91 char signature[13];
92
93 if (boot_cpu_data.cpuid_level < 0) 90 if (boot_cpu_data.cpuid_level < 0)
94 return false; /* So we don't blow up on old processors */ 91 return 0; /* So we don't blow up on old processors */
95 92
96 if (cpu_has_hypervisor) { 93 if (cpu_has_hypervisor)
97 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); 94 return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
98 memcpy(signature + 0, &ebx, 4);
99 memcpy(signature + 4, &ecx, 4);
100 memcpy(signature + 8, &edx, 4);
101 signature[12] = 0;
102 95
103 if (strcmp(signature, "KVMKVMKVM") == 0) 96 return 0;
104 return true; 97}
105 }
106 98
107 return false; 99static inline bool kvm_para_available(void)
100{
101 return kvm_cpuid_base() != 0;
108} 102}
109 103
110static inline unsigned int kvm_arch_para_features(void) 104static inline unsigned int kvm_arch_para_features(void)
@@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token);
118void kvm_async_pf_task_wake(u32 token); 112void kvm_async_pf_task_wake(u32 token);
119u32 kvm_read_and_reset_pf_reason(void); 113u32 kvm_read_and_reset_pf_reason(void);
120extern void kvm_disable_steal_time(void); 114extern void kvm_disable_steal_time(void);
121#else 115
122#define kvm_guest_init() do { } while (0) 116#ifdef CONFIG_PARAVIRT_SPINLOCKS
117void __init kvm_spinlock_init(void);
118#else /* !CONFIG_PARAVIRT_SPINLOCKS */
119static inline void kvm_spinlock_init(void)
120{
121}
122#endif /* CONFIG_PARAVIRT_SPINLOCKS */
123
124#else /* CONFIG_KVM_GUEST */
125#define kvm_guest_init() do {} while (0)
123#define kvm_async_pf_task_wait(T) do {} while(0) 126#define kvm_async_pf_task_wait(T) do {} while(0)
124#define kvm_async_pf_task_wake(T) do {} while(0) 127#define kvm_async_pf_task_wake(T) do {} while(0)
128
125static inline u32 kvm_read_and_reset_pf_reason(void) 129static inline u32 kvm_read_and_reset_pf_reason(void)
126{ 130{
127 return 0; 131 return 0;
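With the rework above, KVM detection reduces to a signature lookup through hypervisor_cpuid_base() (added to processor.h later in this series), and kvm_para_available() simply checks for a non-zero base. A hedged sketch of a guest-side probe built on these helpers; the function name and messages are illustrative, while kvm_arch_para_features() and KVM_FEATURE_PV_UNHALT come from elsewhere in kvm_para.h and the uapi hunk in this series:

/* Sketch only; not part of this patch set. */
static void __init example_kvm_probe(void)
{
        if (!kvm_para_available())              /* kvm_cpuid_base() == 0 */
                return;

        pr_info("KVM signature found at CPUID base 0x%x\n", kvm_cpuid_base());

        if (kvm_arch_para_features() & (1 << KVM_FEATURE_PV_UNHALT))
                pr_info("host supports PV unhalt for paravirt ticketlocks\n");
}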
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 29e3093bbd21..cbe6b9e404ce 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -32,11 +32,20 @@
32#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ 32#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
33#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ 33#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
34#define MCI_STATUS_AR (1ULL<<55) /* Action required */ 34#define MCI_STATUS_AR (1ULL<<55) /* Action required */
35#define MCACOD 0xffff /* MCA Error Code */ 35
36/*
37 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
38 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
39 * errors to indicate that errors are being filtered by hardware.
40 * We should mask out bit 12 when looking for specific signatures
41 * of uncorrected errors - so the F bit is deliberately skipped
42 * in this #define.
43 */
44#define MCACOD 0xefff /* MCA Error Code */
36 45
37/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ 46/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
38#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ 47#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
39#define MCACOD_SCRUBMSK 0xfff0 48#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
40#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ 49#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
41#define MCACOD_DATA 0x0134 /* Data Load */ 50#define MCACOD_DATA 0x0134 /* Data Load */
42#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ 51#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
@@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
188 const char __user *ubuf, 197 const char __user *ubuf,
189 size_t usize, loff_t *off)); 198 size_t usize, loff_t *off));
190 199
200/* Disable CMCI/polling for MCA bank claimed by firmware */
201extern void mce_disable_bank(int bank);
202
191/* 203/*
192 * Exception handler 204 * Exception handler
193 */ 205 */
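The mask change from 0xffff to 0xefff matters because bit 12 of the MCA error code is the 'F' (filtering) bit: a status word with F set must still match the same architectural signature. A small standalone check of that arithmetic; the status values are made up for illustration, not taken from real hardware:

#include <stdio.h>
#include <stdint.h>

#define MCACOD       0xefff          /* mask that skips bit 12, as above */
#define MCACOD_DATA  0x0134          /* Data Load signature */

int main(void)
{
        uint64_t status_plain    = 0x0134;          /* unfiltered data-load error */
        uint64_t status_filtered = 0x0134 | 0x1000; /* same error, 'F' bit set    */

        printf("plain    matches: %d\n", (int)((status_plain    & MCACOD) == MCACOD_DATA));
        printf("filtered matches: %d\n", (int)((status_filtered & MCACOD) == MCACOD_DATA));
        /* With the old 0xffff mask the second comparison would fail. */
        return 0;
}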
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 50e5c58ced23..4c019179a57d 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
59 59
60extern int __apply_microcode_amd(struct microcode_amd *mc_amd); 60extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
61extern int apply_microcode_amd(int cpu); 61extern int apply_microcode_amd(int cpu);
62extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size); 62extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
63 63
64#ifdef CONFIG_MICROCODE_AMD_EARLY 64#ifdef CONFIG_MICROCODE_AMD_EARLY
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cdbf36776106..be12c534fd59 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
45 /* Re-load page tables */ 45 /* Re-load page tables */
46 load_cr3(next->pgd); 46 load_cr3(next->pgd);
47 47
48 /* stop flush ipis for the previous mm */ 48 /* Stop flush ipis for the previous mm */
49 cpumask_clear_cpu(cpu, mm_cpumask(prev)); 49 cpumask_clear_cpu(cpu, mm_cpumask(prev));
50 50
51 /* 51 /* Load the LDT, if the LDT is different: */
52 * load the LDT, if the LDT is different:
53 */
54 if (unlikely(prev->context.ldt != next->context.ldt)) 52 if (unlikely(prev->context.ldt != next->context.ldt))
55 load_LDT_nolock(&next->context); 53 load_LDT_nolock(&next->context);
56 } 54 }
57#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
58 else { 56 else {
59 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 57 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
60 BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); 58 BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
61 59
62 if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { 60 if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
63 /* We were in lazy tlb mode and leave_mm disabled 61 /*
62 * On established mms, the mm_cpumask is only changed
63 * from irq context, from ptep_clear_flush() while in
64 * lazy tlb mode, and here. Irqs are blocked during
65 * schedule, protecting us from simultaneous changes.
66 */
67 cpumask_set_cpu(cpu, mm_cpumask(next));
68 /*
69 * We were in lazy tlb mode and leave_mm disabled
64 * tlb flush IPI delivery. We must reload CR3 70 * tlb flush IPI delivery. We must reload CR3
65 * to make sure to use no freed page tables. 71 * to make sure to use no freed page tables.
66 */ 72 */
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 2c543fff241b..e7e6751648ed 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -16,6 +16,20 @@
16 * 16 *
17 * Atomically decrements @v and calls <fail_fn> if the result is negative. 17 * Atomically decrements @v and calls <fail_fn> if the result is negative.
18 */ 18 */
19#ifdef CC_HAVE_ASM_GOTO
20static inline void __mutex_fastpath_lock(atomic_t *v,
21 void (*fail_fn)(atomic_t *))
22{
23 asm volatile goto(LOCK_PREFIX " decl %0\n"
24 " jns %l[exit]\n"
25 : : "m" (v->counter)
26 : "memory", "cc"
27 : exit);
28 fail_fn(v);
29exit:
30 return;
31}
32#else
19#define __mutex_fastpath_lock(v, fail_fn) \ 33#define __mutex_fastpath_lock(v, fail_fn) \
20do { \ 34do { \
21 unsigned long dummy; \ 35 unsigned long dummy; \
@@ -32,6 +46,7 @@ do { \
32 : "rax", "rsi", "rdx", "rcx", \ 46 : "rax", "rsi", "rdx", "rcx", \
33 "r8", "r9", "r10", "r11", "memory"); \ 47 "r8", "r9", "r10", "r11", "memory"); \
34} while (0) 48} while (0)
49#endif
35 50
36/** 51/**
37 * __mutex_fastpath_lock_retval - try to take the lock by moving the count 52 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
@@ -56,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count)
56 * 71 *
57 * Atomically increments @v and calls <fail_fn> if the result is nonpositive. 72 * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
58 */ 73 */
74#ifdef CC_HAVE_ASM_GOTO
75static inline void __mutex_fastpath_unlock(atomic_t *v,
76 void (*fail_fn)(atomic_t *))
77{
78 asm volatile goto(LOCK_PREFIX " incl %0\n"
79 " jg %l[exit]\n"
80 : : "m" (v->counter)
81 : "memory", "cc"
82 : exit);
83 fail_fn(v);
84exit:
85 return;
86}
87#else
59#define __mutex_fastpath_unlock(v, fail_fn) \ 88#define __mutex_fastpath_unlock(v, fail_fn) \
60do { \ 89do { \
61 unsigned long dummy; \ 90 unsigned long dummy; \
@@ -72,6 +101,7 @@ do { \
72 : "rax", "rsi", "rdx", "rcx", \ 101 : "rax", "rsi", "rdx", "rcx", \
73 "r8", "r9", "r10", "r11", "memory"); \ 102 "r8", "r9", "r10", "r11", "memory"); \
74} while (0) 103} while (0)
104#endif
75 105
76#define __mutex_slowpath_needs_to_unlock() 1 106#define __mutex_slowpath_needs_to_unlock() 1
77 107
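The CC_HAVE_ASM_GOTO variants above fold the locked decrement/increment and the branch to the slowpath into a single asm statement, so the compiler no longer has to materialize the condition in a register. A standalone x86-64 sketch of the same 'asm goto' fastpath pattern that can be built and run in user space (gcc -O2, any compiler with asm goto support); the counter and slowpath function are illustrative only:

#include <stdio.h>

static int counter = 1;

static void slowpath(int *v)
{
        printf("slowpath taken, counter=%d\n", *v);
}

static void fastpath_lock(int *v)
{
        /* Decrement and branch in one asm statement, as in the hunk above. */
        asm volatile goto("lock decl %0\n\t"
                          "jns %l[done]"
                          : : "m" (*v)
                          : "memory", "cc"
                          : done);
        slowpath(v);
done:
        return;
}

int main(void)
{
        fastpath_lock(&counter);   /* 1 -> 0: non-negative, fast path      */
        fastpath_lock(&counter);   /* 0 -> -1: negative, slowpath() fires  */
        return 0;
}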
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index ef17af013475..f48b17df4224 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,6 +15,8 @@
15 */ 15 */
16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) 16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
17 17
18#define __START_KERNEL_map __PAGE_OFFSET
19
18#define THREAD_SIZE_ORDER 1 20#define THREAD_SIZE_ORDER 1
19#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 21#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
20 22
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6c896fbe21db..43dcd804ebd5 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,11 +32,6 @@
32 */ 32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL) 33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34 34
35#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
36 (CONFIG_PHYSICAL_ALIGN - 1)) & \
37 ~(CONFIG_PHYSICAL_ALIGN - 1))
38
39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
40#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 35#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
41 36
42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ 37/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 54c97879195e..f97fbe3abb67 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -33,6 +33,11 @@
33 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ 33 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
34 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) 34 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
35 35
36#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
37 CONFIG_PHYSICAL_ALIGN)
38
39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
40
36#ifdef CONFIG_X86_64 41#ifdef CONFIG_X86_64
37#include <asm/page_64_types.h> 42#include <asm/page_64_types.h>
38#else 43#else
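Moving __PHYSICAL_START here and expressing it with ALIGN() is a pure cleanup: for a power-of-two CONFIG_PHYSICAL_ALIGN, ALIGN(x, a) computes the same value as the open-coded mask expression removed from page_64_types.h above. A quick standalone check with made-up start/align values; ALIGN_UP mirrors the kernel's ALIGN() for power-of-two alignments:

#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long start = 0x1000000;   /* hypothetical CONFIG_PHYSICAL_START */
        unsigned long align = 0x200000;    /* hypothetical CONFIG_PHYSICAL_ALIGN */

        unsigned long old_way = (start + (align - 1)) & ~(align - 1);
        unsigned long new_way = ALIGN_UP(start, align);

        printf("old=%#lx new=%#lx same=%d\n", old_way, new_way,
               (int)(old_way == new_way));
        return 0;
}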
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cfdc9ee4c900..401f350ef71b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
712 712
713#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) 713#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
714 714
715static inline int arch_spin_is_locked(struct arch_spinlock *lock) 715static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
716 __ticket_t ticket)
716{ 717{
717 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); 718 PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
718} 719}
719 720
720static inline int arch_spin_is_contended(struct arch_spinlock *lock) 721static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
722 __ticket_t ticket)
721{ 723{
722 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); 724 PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
723}
724#define arch_spin_is_contended arch_spin_is_contended
725
726static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
727{
728 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
729}
730
731static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
732 unsigned long flags)
733{
734 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
735}
736
737static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
738{
739 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
740}
741
742static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
743{
744 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
745} 725}
746 726
747#endif 727#endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0db1fcac668c..aab8f671b523 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -327,13 +327,15 @@ struct pv_mmu_ops {
327}; 327};
328 328
329struct arch_spinlock; 329struct arch_spinlock;
330#ifdef CONFIG_SMP
331#include <asm/spinlock_types.h>
332#else
333typedef u16 __ticket_t;
334#endif
335
330struct pv_lock_ops { 336struct pv_lock_ops {
331 int (*spin_is_locked)(struct arch_spinlock *lock); 337 struct paravirt_callee_save lock_spinning;
332 int (*spin_is_contended)(struct arch_spinlock *lock); 338 void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
333 void (*spin_lock)(struct arch_spinlock *lock);
334 void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
335 int (*spin_trylock)(struct arch_spinlock *lock);
336 void (*spin_unlock)(struct arch_spinlock *lock);
337}; 339};
338 340
339/* This contains all the paravirt structures: we get a convenient 341/* This contains all the paravirt structures: we get a convenient
@@ -387,7 +389,8 @@ extern struct pv_lock_ops pv_lock_ops;
387 389
388/* Simple instruction patching code. */ 390/* Simple instruction patching code. */
389#define DEF_NATIVE(ops, name, code) \ 391#define DEF_NATIVE(ops, name, code) \
390 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 392 extern const char start_##ops##_##name[] __visible, \
393 end_##ops##_##name[] __visible; \
391 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 394 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
392 395
393unsigned paravirt_patch_nop(void); 396unsigned paravirt_patch_nop(void);
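With pv_lock_ops reduced to lock_spinning and unlock_kick, a hypervisor backend only has to supply a "block until my ticket comes up" hook and a "kick the owner of this ticket" hook. A heavily hedged sketch of what such a registration might look like; the function names are hypothetical, the static key is the one declared in the spinlock.h hunk below, and the real KVM/Xen patches in this series additionally emit a callee-save thunk for the lock_spinning callback:

static void example_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
        /* park this vCPU until 'want' reaches the head of the queue */
}

static void example_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
{
        /* wake whichever vCPU is waiting on 'ticket' */
}

static void __init example_spinlock_init(void)
{
        /* assumes PV_CALLEE_SAVE_REGS_THUNK(example_lock_spinning) elsewhere */
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(example_lock_spinning);
        pv_lock_ops.unlock_kick   = example_unlock_kick;
        static_key_slow_inc(&paravirt_ticketlocks_enabled);
}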
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index f2b489cf1602..3bf2dd0cf61f 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif 56#endif
57 57
58#ifdef CONFIG_MEM_SOFT_DIRTY
59
60/*
61 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
62 * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
63 * into this range.
64 */
65#define PTE_FILE_MAX_BITS 28
66#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
67#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
68#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
69#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1)
70#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
71#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
72#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
73
74#define pte_to_pgoff(pte) \
75 ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \
76 & ((1U << PTE_FILE_BITS1) - 1))) \
77 + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \
78 & ((1U << PTE_FILE_BITS2) - 1)) \
79 << (PTE_FILE_BITS1)) \
80 + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \
81 & ((1U << PTE_FILE_BITS3) - 1)) \
82 << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
83 + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \
84 << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))
85
86#define pgoff_to_pte(off) \
87 ((pte_t) { .pte_low = \
88 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
89 + ((((off) >> PTE_FILE_BITS1) \
90 & ((1U << PTE_FILE_BITS2) - 1)) \
91 << PTE_FILE_SHIFT2) \
92 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
93 & ((1U << PTE_FILE_BITS3) - 1)) \
94 << PTE_FILE_SHIFT3) \
95 + ((((off) >> \
96 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \
97 << PTE_FILE_SHIFT4) \
98 + _PAGE_FILE })
99
100#else /* CONFIG_MEM_SOFT_DIRTY */
101
58/* 102/*
59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 103 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
60 * split up the 29 bits of offset into this range: 104 * split up the 29 bits of offset into this range.
61 */ 105 */
62#define PTE_FILE_MAX_BITS 29 106#define PTE_FILE_MAX_BITS 29
63#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) 107#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
@@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
88 << PTE_FILE_SHIFT3) \ 132 << PTE_FILE_SHIFT3) \
89 + _PAGE_FILE }) 133 + _PAGE_FILE })
90 134
135#endif /* CONFIG_MEM_SOFT_DIRTY */
136
91/* Encode and de-code a swap entry */ 137/* Encode and de-code a swap entry */
92#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 138#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
93#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 139#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
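The CONFIG_MEM_SOFT_DIRTY variant above squeezes a 28-bit file offset into the low pte word while keeping bits 0, 6, 8 and 11 free for present, file, protnone and soft-dirty. A standalone round-trip check of that packing, with the shifts written out numerically under the usual x86 bit assignments (an assumption, not stated in this hunk) and the _PAGE_FILE bit itself omitted for brevity:

#include <stdio.h>
#include <stdint.h>

#define SHIFT1 1
#define SHIFT2 7
#define SHIFT3 9
#define SHIFT4 12
#define BITS1  (SHIFT2 - SHIFT1 - 1)   /* 5 bits  */
#define BITS2  (SHIFT3 - SHIFT2 - 1)   /* 1 bit   */
#define BITS3  (SHIFT4 - SHIFT3 - 1)   /* 2 bits  */

static uint32_t pgoff_to_pte_low(uint32_t off)
{
        return  ((off & ((1u << BITS1) - 1)) << SHIFT1) |
                (((off >> BITS1) & ((1u << BITS2) - 1)) << SHIFT2) |
                (((off >> (BITS1 + BITS2)) & ((1u << BITS3) - 1)) << SHIFT3) |
                ((off >> (BITS1 + BITS2 + BITS3)) << SHIFT4);
}

static uint32_t pte_low_to_pgoff(uint32_t pte)
{
        return  ((pte >> SHIFT1) & ((1u << BITS1) - 1)) |
                (((pte >> SHIFT2) & ((1u << BITS2) - 1)) << BITS1) |
                (((pte >> SHIFT3) & ((1u << BITS3) - 1)) << (BITS1 + BITS2)) |
                ((pte >> SHIFT4) << (BITS1 + BITS2 + BITS3));
}

int main(void)
{
        uint32_t off = 0x0abcdef;   /* any offset below 2^28 */
        uint32_t pte = pgoff_to_pte_low(off);

        printf("off=%#x -> pte_low=%#x -> off=%#x\n",
               off, pte, pte_low_to_pgoff(pte));
        return 0;
}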
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 4cc9f2b7cdc3..81bb91b49a88 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
179/* 179/*
180 * Bits 0, 6 and 7 are taken in the low part of the pte, 180 * Bits 0, 6 and 7 are taken in the low part of the pte,
181 * put the 32 bits of offset into the high part. 181 * put the 32 bits of offset into the high part.
182 *
183 * For soft-dirty tracking 11 bit is taken from
184 * the low part of pte as well.
182 */ 185 */
183#define pte_to_pgoff(pte) ((pte).pte_high) 186#define pte_to_pgoff(pte) ((pte).pte_high)
184#define pgoff_to_pte(off) \ 187#define pgoff_to_pte(off) \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 7dc305a46058..8d16befdec88 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -22,7 +22,8 @@
22 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
23 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
24 */ 24 */
25extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; 25extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
26 __visible;
26#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) 27#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
27 28
28extern spinlock_t pgd_lock; 29extern spinlock_t pgd_lock;
@@ -314,6 +315,36 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
314 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
315} 316}
316 317
318static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
319{
320 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
321}
322
323static inline int pte_swp_soft_dirty(pte_t pte)
324{
325 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
326}
327
328static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
329{
330 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
331}
332
333static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
334{
335 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
336}
337
338static inline pte_t pte_file_mksoft_dirty(pte_t pte)
339{
340 return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
341}
342
343static inline int pte_file_soft_dirty(pte_t pte)
344{
345 return pte_flags(pte) & _PAGE_SOFT_DIRTY;
346}
347
317/* 348/*
318 * Mask out unsupported bits in a present pgprot. Non-present pgprots 349 * Mask out unsupported bits in a present pgprot. Non-present pgprots
319 * can use those bits for other purposes, so leave them be. 350 * can use those bits for other purposes, so leave them be.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index c98ac63aae48..f4843e031131 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -61,12 +61,27 @@
61 * they do not conflict with each other. 61 * they do not conflict with each other.
62 */ 62 */
63 63
64#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
65
64#ifdef CONFIG_MEM_SOFT_DIRTY 66#ifdef CONFIG_MEM_SOFT_DIRTY
65#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) 67#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
66#else 68#else
67#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) 69#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
68#endif 70#endif
69 71
72/*
73 * Tracking soft dirty bit when a page goes to a swap is tricky.
74 * We need a bit which can be stored in pte _and_ not conflict
75 * with swap entry format. On x86 bits 6 and 7 are *not* involved
76 * into swap entry computation, but bit 6 is used for nonlinear
77 * file mapping, so we borrow bit 7 for soft dirty tracking.
78 */
79#ifdef CONFIG_MEM_SOFT_DIRTY
80#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
81#else
82#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
83#endif
84
70#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 85#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
71#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 86#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
72#else 87#else
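Bit 7 (_PAGE_PSE) is reused as _PAGE_SWP_SOFT_DIRTY because it plays no part in the swap entry encoding, and bit 6 is already claimed by nonlinear file mappings. A hedged sketch of how a swap-out path is expected to carry the bit across swap, using pte_soft_dirty() and the pte_swp_* helpers from the pgtable.h hunk above; the wrapper function is hypothetical:

static void example_make_swap_pte(struct mm_struct *mm, unsigned long addr,
                                  pte_t *ptep, pte_t pteval, swp_entry_t entry)
{
        pte_t swp_pte = swp_entry_to_pte(entry);

        /* preserve soft-dirty across swap via the borrowed PSE bit */
        if (pte_soft_dirty(pteval))
                swp_pte = pte_swp_mksoft_dirty(swp_pte);

        set_pte_at(mm, addr, ptep, swp_pte);
}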
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 24cf5aefb704..987c75ecc334 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -412,7 +412,7 @@ union irq_stack_union {
412 }; 412 };
413}; 413};
414 414
415DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); 415DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
416DECLARE_INIT_PER_CPU(irq_stack_union); 416DECLARE_INIT_PER_CPU(irq_stack_union);
417 417
418DECLARE_PER_CPU(char *, irq_stack_ptr); 418DECLARE_PER_CPU(char *, irq_stack_ptr);
@@ -942,33 +942,19 @@ extern int set_tsc_mode(unsigned int val);
942 942
943extern u16 amd_get_nb_id(int cpu); 943extern u16 amd_get_nb_id(int cpu);
944 944
945struct aperfmperf { 945static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
946 u64 aperf, mperf;
947};
948
949static inline void get_aperfmperf(struct aperfmperf *am)
950{ 946{
951 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); 947 uint32_t base, eax, signature[3];
952
953 rdmsrl(MSR_IA32_APERF, am->aperf);
954 rdmsrl(MSR_IA32_MPERF, am->mperf);
955}
956 948
957#define APERFMPERF_SHIFT 10 949 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
950 cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
958 951
959static inline 952 if (!memcmp(sig, signature, 12) &&
960unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, 953 (leaves == 0 || ((eax - base) >= leaves)))
961 struct aperfmperf *new) 954 return base;
962{ 955 }
963 u64 aperf = new->aperf - old->aperf;
964 u64 mperf = new->mperf - old->mperf;
965 unsigned long ratio = aperf;
966
967 mperf >>= APERFMPERF_SHIFT;
968 if (mperf)
969 ratio = div64_u64(aperf, mperf);
970 956
971 return ratio; 957 return 0;
972} 958}
973 959
974extern unsigned long arch_align_stack(unsigned long sp); 960extern unsigned long arch_align_stack(unsigned long sp);
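hypervisor_cpuid_base() is the new generic helper: it scans the 0x40000000-0x4000ff00 CPUID leaves for a 12-byte vendor signature, and the KVM and Xen hunks in this series both collapse onto it. A hedged sketch of probing a few well-known signatures; the wrapper and the VMware entry are illustrative, only the KVM and Xen strings appear in this series:

/* Sketch only: identify a hypervisor by its CPUID signature. */
static const char *example_detect_hypervisor(void)
{
        if (hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0))
                return "KVM";
        if (hypervisor_cpuid_base("XenVMMXenVMM", 2))
                return "Xen";
        if (hypervisor_cpuid_base("VMwareVMware", 0))
                return "VMware";
        return "none";
}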
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
93 93
94struct pvclock_vsyscall_time_info { 94struct pvclock_vsyscall_time_info {
95 struct pvclock_vcpu_time_info pvti; 95 struct pvclock_vcpu_time_info pvti;
96 u32 migrate_count;
97} __attribute__((__aligned__(SMP_CACHE_BYTES))); 96} __attribute__((__aligned__(SMP_CACHE_BYTES)));
98 97
99#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 98#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index b7bf3505e1ec..347555492dad 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -6,6 +6,8 @@
6 6
7#define COMMAND_LINE_SIZE 2048 7#define COMMAND_LINE_SIZE 2048
8 8
9#include <linux/linkage.h>
10
9#ifdef __i386__ 11#ifdef __i386__
10 12
11#include <linux/pfn.h> 13#include <linux/pfn.h>
@@ -108,11 +110,11 @@ void *extend_brk(size_t size, size_t align);
108extern void probe_roms(void); 110extern void probe_roms(void);
109#ifdef __i386__ 111#ifdef __i386__
110 112
111void __init i386_start_kernel(void); 113asmlinkage void __init i386_start_kernel(void);
112 114
113#else 115#else
114void __init x86_64_start_kernel(char *real_mode); 116asmlinkage void __init x86_64_start_kernel(char *real_mode);
115void __init x86_64_start_reservations(char *real_mode_data); 117asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
116 118
117#endif /* __i386__ */ 119#endif /* __i386__ */
118#endif /* _SETUP */ 120#endif /* _SETUP */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2f4d924fe6c9..645cad2c95ff 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -101,7 +101,7 @@ static inline void native_wbinvd(void)
101 asm volatile("wbinvd": : :"memory"); 101 asm volatile("wbinvd": : :"memory");
102} 102}
103 103
104extern void native_load_gs_index(unsigned); 104extern asmlinkage void native_load_gs_index(unsigned);
105 105
106#ifdef CONFIG_PARAVIRT 106#ifdef CONFIG_PARAVIRT
107#include <asm/paravirt.h> 107#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692eaabab5..bf156ded74b5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,11 +1,14 @@
1#ifndef _ASM_X86_SPINLOCK_H 1#ifndef _ASM_X86_SPINLOCK_H
2#define _ASM_X86_SPINLOCK_H 2#define _ASM_X86_SPINLOCK_H
3 3
4#include <linux/jump_label.h>
4#include <linux/atomic.h> 5#include <linux/atomic.h>
5#include <asm/page.h> 6#include <asm/page.h>
6#include <asm/processor.h> 7#include <asm/processor.h>
7#include <linux/compiler.h> 8#include <linux/compiler.h>
8#include <asm/paravirt.h> 9#include <asm/paravirt.h>
10#include <asm/bitops.h>
11
9/* 12/*
10 * Your basic SMP spinlocks, allowing only a single CPU anywhere 13 * Your basic SMP spinlocks, allowing only a single CPU anywhere
11 * 14 *
@@ -34,6 +37,36 @@
34# define UNLOCK_LOCK_PREFIX 37# define UNLOCK_LOCK_PREFIX
35#endif 38#endif
36 39
40/* How long a lock should spin before we consider blocking */
41#define SPIN_THRESHOLD (1 << 15)
42
43extern struct static_key paravirt_ticketlocks_enabled;
44static __always_inline bool static_key_false(struct static_key *key);
45
46#ifdef CONFIG_PARAVIRT_SPINLOCKS
47
48static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
49{
50 set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
51}
52
53#else /* !CONFIG_PARAVIRT_SPINLOCKS */
54static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
55 __ticket_t ticket)
56{
57}
58static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
59 __ticket_t ticket)
60{
61}
62
63#endif /* CONFIG_PARAVIRT_SPINLOCKS */
64
65static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
66{
67 return lock.tickets.head == lock.tickets.tail;
68}
69
37/* 70/*
38 * Ticket locks are conceptually two parts, one indicating the current head of 71 * Ticket locks are conceptually two parts, one indicating the current head of
39 * the queue, and the other indicating the current tail. The lock is acquired 72 * the queue, and the other indicating the current tail. The lock is acquired
@@ -47,81 +80,101 @@
47 * in the high part, because a wide xadd increment of the low part would carry 80 * in the high part, because a wide xadd increment of the low part would carry
48 * up and contaminate the high part. 81 * up and contaminate the high part.
49 */ 82 */
50static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) 83static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
51{ 84{
52 register struct __raw_tickets inc = { .tail = 1 }; 85 register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
53 86
54 inc = xadd(&lock->tickets, inc); 87 inc = xadd(&lock->tickets, inc);
88 if (likely(inc.head == inc.tail))
89 goto out;
55 90
91 inc.tail &= ~TICKET_SLOWPATH_FLAG;
56 for (;;) { 92 for (;;) {
57 if (inc.head == inc.tail) 93 unsigned count = SPIN_THRESHOLD;
58 break; 94
59 cpu_relax(); 95 do {
60 inc.head = ACCESS_ONCE(lock->tickets.head); 96 if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
97 goto out;
98 cpu_relax();
99 } while (--count);
100 __ticket_lock_spinning(lock, inc.tail);
61 } 101 }
62 barrier(); /* make sure nothing creeps before the lock is taken */ 102out: barrier(); /* make sure nothing creeps before the lock is taken */
63} 103}
64 104
65static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) 105static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
66{ 106{
67 arch_spinlock_t old, new; 107 arch_spinlock_t old, new;
68 108
69 old.tickets = ACCESS_ONCE(lock->tickets); 109 old.tickets = ACCESS_ONCE(lock->tickets);
70 if (old.tickets.head != old.tickets.tail) 110 if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
71 return 0; 111 return 0;
72 112
73 new.head_tail = old.head_tail + (1 << TICKET_SHIFT); 113 new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
74 114
75 /* cmpxchg is a full barrier, so nothing can move before it */ 115 /* cmpxchg is a full barrier, so nothing can move before it */
76 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; 116 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
77} 117}
78 118
79static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) 119static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
120 arch_spinlock_t old)
80{ 121{
81 __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); 122 arch_spinlock_t new;
123
124 BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
125
126 /* Perform the unlock on the "before" copy */
127 old.tickets.head += TICKET_LOCK_INC;
128
129 /* Clear the slowpath flag */
130 new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
131
132 /*
133 * If the lock is uncontended, clear the flag - use cmpxchg in
134 * case it changes behind our back though.
135 */
136 if (new.tickets.head != new.tickets.tail ||
137 cmpxchg(&lock->head_tail, old.head_tail,
138 new.head_tail) != old.head_tail) {
139 /*
140 * Lock still has someone queued for it, so wake up an
141 * appropriate waiter.
142 */
143 __ticket_unlock_kick(lock, old.tickets.head);
144 }
82} 145}
83 146
84static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) 147static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
85{ 148{
86 struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); 149 if (TICKET_SLOWPATH_FLAG &&
150 static_key_false(&paravirt_ticketlocks_enabled)) {
151 arch_spinlock_t prev;
87 152
88 return tmp.tail != tmp.head; 153 prev = *lock;
89} 154 add_smp(&lock->tickets.head, TICKET_LOCK_INC);
90 155
91static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) 156 /* add_smp() is a full mb() */
92{
93 struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
94 157
95 return (__ticket_t)(tmp.tail - tmp.head) > 1; 158 if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
159 __ticket_unlock_slowpath(lock, prev);
160 } else
161 __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
96} 162}
97 163
98#ifndef CONFIG_PARAVIRT_SPINLOCKS
99
100static inline int arch_spin_is_locked(arch_spinlock_t *lock) 164static inline int arch_spin_is_locked(arch_spinlock_t *lock)
101{ 165{
102 return __ticket_spin_is_locked(lock); 166 struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
103}
104
105static inline int arch_spin_is_contended(arch_spinlock_t *lock)
106{
107 return __ticket_spin_is_contended(lock);
108}
109#define arch_spin_is_contended arch_spin_is_contended
110 167
111static __always_inline void arch_spin_lock(arch_spinlock_t *lock) 168 return tmp.tail != tmp.head;
112{
113 __ticket_spin_lock(lock);
114} 169}
115 170
116static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) 171static inline int arch_spin_is_contended(arch_spinlock_t *lock)
117{ 172{
118 return __ticket_spin_trylock(lock); 173 struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
119}
120 174
121static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) 175 return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
122{
123 __ticket_spin_unlock(lock);
124} 176}
177#define arch_spin_is_contended arch_spin_is_contended
125 178
126static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, 179static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
127 unsigned long flags) 180 unsigned long flags)
@@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
129 arch_spin_lock(lock); 182 arch_spin_lock(lock);
130} 183}
131 184
132#endif /* CONFIG_PARAVIRT_SPINLOCKS */
133
134static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) 185static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
135{ 186{
136 while (arch_spin_is_locked(lock)) 187 while (arch_spin_is_locked(lock))
@@ -233,8 +284,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
233#define arch_read_relax(lock) cpu_relax() 284#define arch_read_relax(lock) cpu_relax()
234#define arch_write_relax(lock) cpu_relax() 285#define arch_write_relax(lock) cpu_relax()
235 286
236/* The {read|write|spin}_lock() on x86 are full memory barriers. */
237static inline void smp_mb__after_lock(void) { }
238#define ARCH_HAS_SMP_MB_AFTER_LOCK
239
240#endif /* _ASM_X86_SPINLOCK_H */ 287#endif /* _ASM_X86_SPINLOCK_H */
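The reworked lock above spins for SPIN_THRESHOLD iterations and then calls __ticket_lock_spinning() so a paravirt host can put the vCPU to sleep; tickets advance by TICKET_LOCK_INC = 2 so that bit 0 of the tail is free to act as the slowpath flag. A standalone, single-threaded model of just the ticket arithmetic (no real spinning, cmpxchg or paravirt calls), with illustrative output:

#include <stdio.h>
#include <stdint.h>

#define TICKET_LOCK_INC      2
#define TICKET_SLOWPATH_FLAG 1

struct model_lock { uint8_t head, tail; };

static void model_lock_take(struct model_lock *l)
{
        uint8_t mine = l->tail;                 /* xadd returns the old tail */
        l->tail += TICKET_LOCK_INC;
        mine &= (uint8_t)~TICKET_SLOWPATH_FLAG; /* strip the slowpath flag   */
        printf("took ticket %u, lock head %u -> %s path\n",
               (unsigned)mine, (unsigned)l->head,
               mine == l->head ? "fast" : "slow");
}

static void model_lock_release(struct model_lock *l)
{
        l->head += TICKET_LOCK_INC;             /* add_smp() in the real code */
}

int main(void)
{
        struct model_lock l = { 0, 0 };

        model_lock_take(&l);    /* ticket 0, head 0: fast path              */
        model_lock_take(&l);    /* ticket 2, head 0: would spin / slowpath  */
        model_lock_release(&l);
        model_lock_release(&l);
        return 0;
}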
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07fc006..4f1bea19945b 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -1,13 +1,17 @@
1#ifndef _ASM_X86_SPINLOCK_TYPES_H 1#ifndef _ASM_X86_SPINLOCK_TYPES_H
2#define _ASM_X86_SPINLOCK_TYPES_H 2#define _ASM_X86_SPINLOCK_TYPES_H
3 3
4#ifndef __LINUX_SPINLOCK_TYPES_H
5# error "please don't include this file directly"
6#endif
7
8#include <linux/types.h> 4#include <linux/types.h>
9 5
10#if (CONFIG_NR_CPUS < 256) 6#ifdef CONFIG_PARAVIRT_SPINLOCKS
7#define __TICKET_LOCK_INC 2
8#define TICKET_SLOWPATH_FLAG ((__ticket_t)1)
9#else
10#define __TICKET_LOCK_INC 1
11#define TICKET_SLOWPATH_FLAG ((__ticket_t)0)
12#endif
13
14#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC))
11typedef u8 __ticket_t; 15typedef u8 __ticket_t;
12typedef u16 __ticketpair_t; 16typedef u16 __ticketpair_t;
13#else 17#else
@@ -15,6 +19,8 @@ typedef u16 __ticket_t;
15typedef u32 __ticketpair_t; 19typedef u32 __ticketpair_t;
16#endif 20#endif
17 21
22#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC)
23
18#define TICKET_SHIFT (sizeof(__ticket_t) * 8) 24#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
19 25
20typedef struct arch_spinlock { 26typedef struct arch_spinlock {
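The new NR_CPUS bound follows directly from the wider increment: a u8 ticket has 256 values, and with paravirt spinlocks each queued CPU consumes __TICKET_LOCK_INC = 2 of them, leaving room for at most 128 CPUs before the ticket type must grow to u16. A compile-time sketch of that bound, with hypothetical names:

#define EXAMPLE_NR_CPUS     96
#define EXAMPLE_TICKET_INC  2

/* Fails to compile (negative array size) if u8 tickets would be too narrow. */
typedef char example_u8_tickets_wide_enough[
        (EXAMPLE_NR_CPUS < 256 / EXAMPLE_TICKET_INC) ? 1 : -1];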
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 4ec45b3abba1..d7f3b3b78ac3 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,8 +2,8 @@
2#define _ASM_X86_SWITCH_TO_H 2#define _ASM_X86_SWITCH_TO_H
3 3
4struct task_struct; /* one of the stranger aspects of C forward declarations */ 4struct task_struct; /* one of the stranger aspects of C forward declarations */
5struct task_struct *__switch_to(struct task_struct *prev, 5__visible struct task_struct *__switch_to(struct task_struct *prev,
6 struct task_struct *next); 6 struct task_struct *next);
7struct tss_struct; 7struct tss_struct;
8void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 8void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
9 struct tss_struct *tss); 9 struct tss_struct *tss);
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 9d09b4073b60..05af3b31d522 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -26,9 +26,9 @@
26 * Note that @nr may be almost arbitrarily large; this function is not 26 * Note that @nr may be almost arbitrarily large; this function is not
27 * restricted to acting on a single-word quantity. 27 * restricted to acting on a single-word quantity.
28 */ 28 */
29static inline void sync_set_bit(int nr, volatile unsigned long *addr) 29static inline void sync_set_bit(long nr, volatile unsigned long *addr)
30{ 30{
31 asm volatile("lock; btsl %1,%0" 31 asm volatile("lock; bts %1,%0"
32 : "+m" (ADDR) 32 : "+m" (ADDR)
33 : "Ir" (nr) 33 : "Ir" (nr)
34 : "memory"); 34 : "memory");
@@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr)
44 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() 44 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
45 * in order to ensure changes are visible on other processors. 45 * in order to ensure changes are visible on other processors.
46 */ 46 */
47static inline void sync_clear_bit(int nr, volatile unsigned long *addr) 47static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
48{ 48{
49 asm volatile("lock; btrl %1,%0" 49 asm volatile("lock; btr %1,%0"
50 : "+m" (ADDR) 50 : "+m" (ADDR)
51 : "Ir" (nr) 51 : "Ir" (nr)
52 : "memory"); 52 : "memory");
@@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
61 * Note that @nr may be almost arbitrarily large; this function is not 61 * Note that @nr may be almost arbitrarily large; this function is not
62 * restricted to acting on a single-word quantity. 62 * restricted to acting on a single-word quantity.
63 */ 63 */
64static inline void sync_change_bit(int nr, volatile unsigned long *addr) 64static inline void sync_change_bit(long nr, volatile unsigned long *addr)
65{ 65{
66 asm volatile("lock; btcl %1,%0" 66 asm volatile("lock; btc %1,%0"
67 : "+m" (ADDR) 67 : "+m" (ADDR)
68 : "Ir" (nr) 68 : "Ir" (nr)
69 : "memory"); 69 : "memory");
@@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr)
77 * This operation is atomic and cannot be reordered. 77 * This operation is atomic and cannot be reordered.
78 * It also implies a memory barrier. 78 * It also implies a memory barrier.
79 */ 79 */
80static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) 80static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr)
81{ 81{
82 int oldbit; 82 int oldbit;
83 83
84 asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" 84 asm volatile("lock; bts %2,%1\n\tsbbl %0,%0"
85 : "=r" (oldbit), "+m" (ADDR) 85 : "=r" (oldbit), "+m" (ADDR)
86 : "Ir" (nr) : "memory"); 86 : "Ir" (nr) : "memory");
87 return oldbit; 87 return oldbit;
@@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
95 * This operation is atomic and cannot be reordered. 95 * This operation is atomic and cannot be reordered.
96 * It also implies a memory barrier. 96 * It also implies a memory barrier.
97 */ 97 */
98static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) 98static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
99{ 99{
100 int oldbit; 100 int oldbit;
101 101
102 asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" 102 asm volatile("lock; btr %2,%1\n\tsbbl %0,%0"
103 : "=r" (oldbit), "+m" (ADDR) 103 : "=r" (oldbit), "+m" (ADDR)
104 : "Ir" (nr) : "memory"); 104 : "Ir" (nr) : "memory");
105 return oldbit; 105 return oldbit;
@@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
113 * This operation is atomic and cannot be reordered. 113 * This operation is atomic and cannot be reordered.
114 * It also implies a memory barrier. 114 * It also implies a memory barrier.
115 */ 115 */
116static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) 116static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
117{ 117{
118 int oldbit; 118 int oldbit;
119 119
120 asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" 120 asm volatile("lock; btc %2,%1\n\tsbbl %0,%0"
121 : "=r" (oldbit), "+m" (ADDR) 121 : "=r" (oldbit), "+m" (ADDR)
122 : "Ir" (nr) : "memory"); 122 : "Ir" (nr) : "memory");
123 return oldbit; 123 return oldbit;
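Two things change in every helper above: the bit number widens from int to long, and the mnemonics drop the 'l' suffix so the operand size follows the register gcc picks for "Ir" (nr). The widening matters once a bitmap can exceed 2^31 bits on a 64-bit kernel. A tiny standalone illustration of the index arithmetic (assumes an LP64 target; no giant bitmap is actually allocated):

#include <stdio.h>

int main(void)
{
        long nr = 0x100000000L;        /* bit 2^32 of some very large bitmap */
        int  nr_as_int = (int)nr;      /* what the old int prototype kept    */

        printf("word index with long: %ld\n", nr / 64);         /* 67108864 */
        printf("word index with int : %d\n",  nr_as_int / 64);  /* 0, wrong */
        return 0;
}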
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 2e188d68397c..aea284b41312 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -20,7 +20,8 @@
20#include <asm/thread_info.h> /* for TS_COMPAT */ 20#include <asm/thread_info.h> /* for TS_COMPAT */
21#include <asm/unistd.h> 21#include <asm/unistd.h>
22 22
23extern const unsigned long sys_call_table[]; 23typedef void (*sys_call_ptr_t)(void);
24extern const sys_call_ptr_t sys_call_table[];
24 25
25/* 26/*
26 * Only the low 32 bits of orig_ax are meaningful, so we return int. 27 * Only the low 32 bits of orig_ax are meaningful, so we return int.
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 2917a6452c49..592a6a672e07 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -24,7 +24,7 @@ asmlinkage long sys_iopl(unsigned int);
24asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 24asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
25 25
26/* kernel/signal.c */ 26/* kernel/signal.c */
27long sys_rt_sigreturn(void); 27asmlinkage long sys_rt_sigreturn(void);
28 28
29/* kernel/tls.c */ 29/* kernel/tls.c */
30asmlinkage long sys_set_thread_area(struct user_desc __user *); 30asmlinkage long sys_set_thread_area(struct user_desc __user *);
@@ -34,7 +34,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
34#ifdef CONFIG_X86_32 34#ifdef CONFIG_X86_32
35 35
36/* kernel/signal.c */ 36/* kernel/signal.c */
37unsigned long sys_sigreturn(void); 37asmlinkage unsigned long sys_sigreturn(void);
38 38
39/* kernel/vm86_32.c */ 39/* kernel/vm86_32.c */
40asmlinkage long sys_vm86old(struct vm86_struct __user *); 40asmlinkage long sys_vm86old(struct vm86_struct __user *);
@@ -44,7 +44,7 @@ asmlinkage long sys_vm86(unsigned long, unsigned long);
44 44
45/* X86_64 only */ 45/* X86_64 only */
46/* kernel/process_64.c */ 46/* kernel/process_64.c */
47long sys_arch_prctl(int, unsigned long); 47asmlinkage long sys_arch_prctl(int, unsigned long);
48 48
49/* kernel/sys_x86_64.c */ 49/* kernel/sys_x86_64.c */
50asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 50asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h
new file mode 100644
index 000000000000..2aeb3e25579c
--- /dev/null
+++ b/arch/x86/include/asm/sysfb.h
@@ -0,0 +1,98 @@
1#ifndef _ARCH_X86_KERNEL_SYSFB_H
2#define _ARCH_X86_KERNEL_SYSFB_H
3
4/*
5 * Generic System Framebuffers on x86
6 * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2 of the License, or (at your option)
11 * any later version.
12 */
13
14#include <linux/kernel.h>
15#include <linux/platform_data/simplefb.h>
16#include <linux/screen_info.h>
17
18enum {
19 M_I17, /* 17-Inch iMac */
20 M_I20, /* 20-Inch iMac */
21 M_I20_SR, /* 20-Inch iMac (Santa Rosa) */
22 M_I24, /* 24-Inch iMac */
23 M_I24_8_1, /* 24-Inch iMac, 8,1th gen */
24 M_I24_10_1, /* 24-Inch iMac, 10,1th gen */
25 M_I27_11_1, /* 27-Inch iMac, 11,1th gen */
26 M_MINI, /* Mac Mini */
27 M_MINI_3_1, /* Mac Mini, 3,1th gen */
28 M_MINI_4_1, /* Mac Mini, 4,1th gen */
29 M_MB, /* MacBook */
30 M_MB_2, /* MacBook, 2nd rev. */
31 M_MB_3, /* MacBook, 3rd rev. */
32 M_MB_5_1, /* MacBook, 5th rev. */
33 M_MB_6_1, /* MacBook, 6th rev. */
34 M_MB_7_1, /* MacBook, 7th rev. */
35 M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */
36 M_MBA, /* MacBook Air */
37 M_MBA_3, /* Macbook Air, 3rd rev */
38 M_MBP, /* MacBook Pro */
39 M_MBP_2, /* MacBook Pro 2nd gen */
40 M_MBP_2_2, /* MacBook Pro 2,2nd gen */
41 M_MBP_SR, /* MacBook Pro (Santa Rosa) */
42 M_MBP_4, /* MacBook Pro, 4th gen */
43 M_MBP_5_1, /* MacBook Pro, 5,1th gen */
44 M_MBP_5_2, /* MacBook Pro, 5,2th gen */
45 M_MBP_5_3, /* MacBook Pro, 5,3rd gen */
46 M_MBP_6_1, /* MacBook Pro, 6,1th gen */
47 M_MBP_6_2, /* MacBook Pro, 6,2th gen */
48 M_MBP_7_1, /* MacBook Pro, 7,1th gen */
49 M_MBP_8_2, /* MacBook Pro, 8,2nd gen */
50 M_UNKNOWN /* placeholder */
51};
52
53struct efifb_dmi_info {
54 char *optname;
55 unsigned long base;
56 int stride;
57 int width;
58 int height;
59 int flags;
60};
61
62#ifdef CONFIG_EFI
63
64extern struct efifb_dmi_info efifb_dmi_list[];
65void sysfb_apply_efi_quirks(void);
66
67#else /* CONFIG_EFI */
68
69static inline void sysfb_apply_efi_quirks(void)
70{
71}
72
73#endif /* CONFIG_EFI */
74
75#ifdef CONFIG_X86_SYSFB
76
77bool parse_mode(const struct screen_info *si,
78 struct simplefb_platform_data *mode);
79int create_simplefb(const struct screen_info *si,
80 const struct simplefb_platform_data *mode);
81
82#else /* CONFIG_X86_SYSFB */
83
84static inline bool parse_mode(const struct screen_info *si,
85 struct simplefb_platform_data *mode)
86{
87 return false;
88}
89
90static inline int create_simplefb(const struct screen_info *si,
91 const struct simplefb_platform_data *mode)
92{
93 return -EINVAL;
94}
95
96#endif /* CONFIG_X86_SYSFB */
97
98#endif /* _ARCH_X86_KERNEL_SYSFB_H */
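The new header exposes a small three-step interface: apply EFI quirks, try to parse the firmware-provided screen_info into a simplefb mode, and register a simple-framebuffer device if that works. A hedged sketch of a caller, roughly what the new arch/x86/kernel/sysfb.c added by this series is expected to do; the function name and the fallback comment are illustrative:

static int __init example_sysfb_init(void)
{
        struct simplefb_platform_data mode;

        sysfb_apply_efi_quirks();

        if (parse_mode(&screen_info, &mode))
                return create_simplefb(&screen_info, &mode);

        /* otherwise fall back to a generic platform-framebuffer device */
        return 0;
}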
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 095b21507b6a..d35f24e231cd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
124#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 124#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
125#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 125#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
126#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 126#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
127
128/* indicates that pointers to the topology cpumask_t maps are valid */
129#define arch_provides_topology_pointers yes
130#endif 127#endif
131 128
132static inline void arch_fix_phys_package_id(int num, u32 slot) 129static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 88eae2aec619..7036cb60cd87 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -6,11 +6,7 @@
6#include <asm/debugreg.h> 6#include <asm/debugreg.h>
7#include <asm/siginfo.h> /* TRAP_TRACE, ... */ 7#include <asm/siginfo.h> /* TRAP_TRACE, ... */
8 8
9#ifdef CONFIG_X86_32 9#define dotraplinkage __visible
10#define dotraplinkage
11#else
12#define dotraplinkage asmlinkage
13#endif
14 10
15asmlinkage void divide_error(void); 11asmlinkage void divide_error(void);
16asmlinkage void debug(void); 12asmlinkage void debug(void);
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index c91e8b9d588b..235be70d5bb4 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -49,6 +49,7 @@ extern void tsc_init(void);
49extern void mark_tsc_unstable(char *reason); 49extern void mark_tsc_unstable(char *reason);
50extern int unsynchronized_tsc(void); 50extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern int check_tsc_disabled(void);
52extern unsigned long native_calibrate_tsc(void); 53extern unsigned long native_calibrate_tsc(void);
53 54
54extern int tsc_clocksource_reliable; 55extern int tsc_clocksource_reliable;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5ee26875baea..5838fa911aa0 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
153 * Careful: we have to cast the result to the type of the pointer 153 * Careful: we have to cast the result to the type of the pointer
154 * for sign reasons. 154 * for sign reasons.
155 * 155 *
156 * The use of %edx as the register specifier is a bit of a 156 * The use of _ASM_DX as the register specifier is a bit of a
157 * simplification, as gcc only cares about it as the starting point 157 * simplification, as gcc only cares about it as the starting point
158 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits 158 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
159 * (%ecx being the next register in gcc's x86 register sequence), and 159 * (%ecx being the next register in gcc's x86 register sequence), and
160 * %rdx on 64 bits. 160 * %rdx on 64 bits.
161 *
162 * Clang/LLVM cares about the size of the register, but still wants
163 * the base register for something that ends up being a pair.
161 */ 164 */
162#define get_user(x, ptr) \ 165#define get_user(x, ptr) \
163({ \ 166({ \
164 int __ret_gu; \ 167 int __ret_gu; \
165 register __inttype(*(ptr)) __val_gu asm("%edx"); \ 168 register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
166 __chk_user_ptr(ptr); \ 169 __chk_user_ptr(ptr); \
167 might_fault(); \ 170 might_fault(); \
168 asm volatile("call __get_user_%P3" \ 171 asm volatile("call __get_user_%P3" \
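The get_user() change is purely about the register constraint: spelling it _ASM_DX keeps clang happy about operand size while gcc continues to treat it as the base of the %ecx:%edx pair on 32-bit. Call sites are unaffected; a minimal hedged example of one, with a hypothetical caller:

static int example_read_arg(const int __user *uptr)
{
        int val;

        if (get_user(val, uptr))
                return -EFAULT;

        return val;
}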
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
388#define VMX_EPT_EXTENT_CONTEXT 1 388#define VMX_EPT_EXTENT_CONTEXT 1
389#define VMX_EPT_EXTENT_GLOBAL 2 389#define VMX_EPT_EXTENT_GLOBAL 2
390#define VMX_EPT_EXTENT_SHIFT 24
390 391
391#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) 392#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
392#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) 393#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
394#define VMX_EPTP_WB_BIT (1ull << 14) 395#define VMX_EPTP_WB_BIT (1ull << 14)
395#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 396#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
396#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 397#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
398#define VMX_EPT_INVEPT_BIT (1ull << 20)
397#define VMX_EPT_AD_BIT (1ull << 21) 399#define VMX_EPT_AD_BIT (1ull << 21)
398#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 400#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
399#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 401#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac2af41..d76ac40da206 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -35,7 +35,7 @@
35 35
36#define DEFINE_VVAR(type, name) \ 36#define DEFINE_VVAR(type, name) \
37 type name \ 37 type name \
38 __attribute__((section(".vvar_" #name), aligned(16))) 38 __attribute__((section(".vvar_" #name), aligned(16))) __visible
39 39
40#define VVAR(name) (*vvaraddr_ ## name) 40#define VVAR(name) (*vvaraddr_ ## name)
41 41
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index ca842f2769ef..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -7,6 +7,7 @@ enum ipi_vector {
7 XEN_CALL_FUNCTION_SINGLE_VECTOR, 7 XEN_CALL_FUNCTION_SINGLE_VECTOR,
8 XEN_SPIN_UNLOCK_VECTOR, 8 XEN_SPIN_UNLOCK_VECTOR,
9 XEN_IRQ_WORK_VECTOR, 9 XEN_IRQ_WORK_VECTOR,
10 XEN_NMI_VECTOR,
10 11
11 XEN_NR_IPIS, 12 XEN_NR_IPIS,
12}; 13};
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 125f344f06a9..d866959e5685 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -40,21 +40,7 @@ extern struct start_info *xen_start_info;
40 40
41static inline uint32_t xen_cpuid_base(void) 41static inline uint32_t xen_cpuid_base(void)
42{ 42{
43 uint32_t base, eax, ebx, ecx, edx; 43 return hypervisor_cpuid_base("XenVMMXenVMM", 2);
44 char signature[13];
45
46 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
47 cpuid(base, &eax, &ebx, &ecx, &edx);
48 *(uint32_t *)(signature + 0) = ebx;
49 *(uint32_t *)(signature + 4) = ecx;
50 *(uint32_t *)(signature + 8) = edx;
51 signature[12] = 0;
52
53 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
54 return base;
55 }
56
57 return 0;
58} 44}
59 45
60#ifdef CONFIG_XEN 46#ifdef CONFIG_XEN
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 06fdbd987e97..94dc8ca434e0 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -23,6 +23,7 @@
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5 24#define KVM_FEATURE_STEAL_TIME 5
25#define KVM_FEATURE_PV_EOI 6 25#define KVM_FEATURE_PV_EOI 6
26#define KVM_FEATURE_PV_UNHALT 7
26 27
27/* The last 8 bits are used to indicate how to interpret the flags field 28/* The last 8 bits are used to indicate how to interpret the flags field
28 * in pvclock structure. If no bits are set, all flags are ignored. 29 * in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
65#define EXIT_REASON_EOI_INDUCED 45 65#define EXIT_REASON_EOI_INDUCED 45
66#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
67#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
68#define EXIT_REASON_INVEPT 50
68#define EXIT_REASON_PREEMPTION_TIMER 52 69#define EXIT_REASON_PREEMPTION_TIMER 52
69#define EXIT_REASON_WBINVD 54 70#define EXIT_REASON_WBINVD 54
70#define EXIT_REASON_XSETBV 55 71#define EXIT_REASON_XSETBV 55
@@ -106,12 +107,13 @@
106 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 107 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
107 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 108 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
108 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 109 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
110 { EXIT_REASON_INVEPT, "INVEPT" }, \
111 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
109 { EXIT_REASON_WBINVD, "WBINVD" }, \ 112 { EXIT_REASON_WBINVD, "WBINVD" }, \
110 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ 113 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
111 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 114 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
112 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 115 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
113 { EXIT_REASON_INVD, "INVD" }, \ 116 { EXIT_REASON_INVD, "INVD" }, \
114 { EXIT_REASON_INVPCID, "INVPCID" }, \ 117 { EXIT_REASON_INVPCID, "INVPCID" }
115 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
116 118
117#endif /* _UAPIVMX_H */ 119#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d99ea77723..a5408b965c9d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -103,6 +103,9 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
103obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 103obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
104obj-$(CONFIG_OF) += devicetree.o 104obj-$(CONFIG_OF) += devicetree.o
105obj-$(CONFIG_UPROBES) += uprobes.o 105obj-$(CONFIG_UPROBES) += uprobes.o
106obj-y += sysfb.o
107obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
108obj-$(CONFIG_EFI) += sysfb_efi.o
106 109
107obj-$(CONFIG_PERF_EVENTS) += perf_regs.o 110obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
108obj-$(CONFIG_TRACING) += tracepoint.o 111obj-$(CONFIG_TRACING) += tracepoint.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2627a81253ee..40c76604199f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
67int acpi_lapic; 67int acpi_lapic;
68int acpi_ioapic; 68int acpi_ioapic;
69int acpi_strict; 69int acpi_strict;
70int acpi_disable_cmcff;
70 71
71u8 acpi_sci_flags __initdata; 72u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 73int acpi_sci_override_gsi __initdata;
@@ -141,16 +142,8 @@ static u32 irq_to_gsi(int irq)
141} 142}
142 143
143/* 144/*
144 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 145 * This is just a simple wrapper around early_ioremap(),
145 * to map the target physical address. The problem is that set_fixmap() 146 * with sanity checks for phys == 0 and size == 0.
146 * provides a single page, and it is possible that the page is not
147 * sufficient.
148 * By using this area, we can map up to MAX_IO_APICS pages temporarily,
149 * i.e. until the next __va_range() call.
150 *
151 * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
152 * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
153 * count idx down while incrementing the phys address.
154 */ 147 */
155char *__init __acpi_map_table(unsigned long phys, unsigned long size) 148char *__init __acpi_map_table(unsigned long phys, unsigned long size)
156{ 149{
@@ -160,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
160 153
161 return early_ioremap(phys, size); 154 return early_ioremap(phys, size);
162} 155}
156
163void __init __acpi_unmap_table(char *map, unsigned long size) 157void __init __acpi_unmap_table(char *map, unsigned long size)
164{ 158{
165 if (!map || !size) 159 if (!map || !size)
@@ -199,7 +193,7 @@ static void acpi_register_lapic(int id, u8 enabled)
199{ 193{
200 unsigned int ver = 0; 194 unsigned int ver = 0;
201 195
202 if (id >= (MAX_LOCAL_APIC-1)) { 196 if (id >= MAX_LOCAL_APIC) {
203 printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); 197 printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
204 return; 198 return;
205 } 199 }
@@ -1120,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1120 int ioapic; 1114 int ioapic;
1121 int ioapic_pin; 1115 int ioapic_pin;
1122 struct io_apic_irq_attr irq_attr; 1116 struct io_apic_irq_attr irq_attr;
1117 int ret;
1123 1118
1124 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1119 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1125 return gsi; 1120 return gsi;
@@ -1149,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1149 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, 1144 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1150 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, 1145 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1151 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 1146 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1152 io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); 1147 ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
1148 if (ret < 0)
1149 gsi = INT_MIN;
1153 1150
1154 return gsi; 1151 return gsi;
1155} 1152}
@@ -1626,6 +1623,10 @@ static int __init parse_acpi(char *arg)
1626 /* "acpi=copy_dsdt" copys DSDT */ 1623 /* "acpi=copy_dsdt" copys DSDT */
1627 else if (strcmp(arg, "copy_dsdt") == 0) { 1624 else if (strcmp(arg, "copy_dsdt") == 0) {
1628 acpi_gbl_copy_dsdt_locally = 1; 1625 acpi_gbl_copy_dsdt_locally = 1;
1626 }
1627 /* "acpi=nocmcff" disables FF mode for corrected errors */
1628 else if (strcmp(arg, "nocmcff") == 0) {
1629 acpi_disable_cmcff = 1;
1629 } else { 1630 } else {
1630 /* Core will printk when we return error. */ 1631 /* Core will printk when we return error. */
1631 return -EINVAL; 1632 return -EINVAL;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c15cf9a25e27..15e8563e5c24 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
11#include <linux/memory.h> 11#include <linux/memory.h>
12#include <linux/stop_machine.h> 12#include <linux/stop_machine.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/kdebug.h>
14#include <asm/alternative.h> 15#include <asm/alternative.h>
15#include <asm/sections.h> 16#include <asm/sections.h>
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
@@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
596 return addr; 597 return addr;
597} 598}
598 599
599/* 600static void do_sync_core(void *info)
600 * Cross-modifying kernel text with stop_machine(). 601{
601 * This code originally comes from immediate value. 602 sync_core();
602 */ 603}
603static atomic_t stop_machine_first;
604static int wrote_text;
605 604
606struct text_poke_params { 605static bool bp_patching_in_progress;
607 struct text_poke_param *params; 606static void *bp_int3_handler, *bp_int3_addr;
608 int nparams;
609};
610 607
611static int __kprobes stop_machine_text_poke(void *data) 608int poke_int3_handler(struct pt_regs *regs)
612{ 609{
613 struct text_poke_params *tpp = data; 610 /* bp_patching_in_progress */
614 struct text_poke_param *p; 611 smp_rmb();
615 int i;
616 612
617 if (atomic_xchg(&stop_machine_first, 0)) { 613 if (likely(!bp_patching_in_progress))
618 for (i = 0; i < tpp->nparams; i++) { 614 return 0;
619 p = &tpp->params[i];
620 text_poke(p->addr, p->opcode, p->len);
621 }
622 smp_wmb(); /* Make sure other cpus see that this has run */
623 wrote_text = 1;
624 } else {
625 while (!wrote_text)
626 cpu_relax();
627 smp_mb(); /* Load wrote_text before following execution */
628 }
629 615
630 for (i = 0; i < tpp->nparams; i++) { 616 if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
631 p = &tpp->params[i]; 617 return 0;
632 flush_icache_range((unsigned long)p->addr, 618
633 (unsigned long)p->addr + p->len); 619 /* set up the specified breakpoint handler */
634 } 620 regs->ip = (unsigned long) bp_int3_handler;
635 /* 621
636 * Intel Archiecture Software Developer's Manual section 7.1.3 specifies 622 return 1;
637 * that a core serializing instruction such as "cpuid" should be
638 * executed on _each_ core before the new instruction is made visible.
639 */
640 sync_core();
641 return 0;
642}
643 623
644/**
645 * text_poke_smp - Update instructions on a live kernel on SMP
646 * @addr: address to modify
647 * @opcode: source of the copy
648 * @len: length to copy
649 *
650 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
651 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
652 * should be allowed, since stop_machine() does _not_ protect code against
653 * NMI and MCE.
654 *
655 * Note: Must be called under get_online_cpus() and text_mutex.
656 */
657void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
658{
659 struct text_poke_params tpp;
660 struct text_poke_param p;
661
662 p.addr = addr;
663 p.opcode = opcode;
664 p.len = len;
665 tpp.params = &p;
666 tpp.nparams = 1;
667 atomic_set(&stop_machine_first, 1);
668 wrote_text = 0;
669 /* Use __stop_machine() because the caller already got online_cpus. */
670 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
671 return addr;
672} 624}
673 625
674/** 626/**
675 * text_poke_smp_batch - Update instructions on a live kernel on SMP 627 * text_poke_bp() -- update instructions on live kernel on SMP
676 * @params: an array of text_poke parameters 628 * @addr: address to patch
677 * @n: the number of elements in params. 629 * @opcode: opcode of new instruction
630 * @len: length to copy
631 * @handler: address to jump to when the temporary breakpoint is hit
678 * 632 *
679 * Modify multi-byte instruction by using stop_machine() on SMP. Since the 633 * Modify multi-byte instruction by using int3 breakpoint on SMP.
680 * stop_machine() is heavy task, it is better to aggregate text_poke requests 634 * We completely avoid stop_machine() here, and achieve the
681 * and do it once if possible. 635 * synchronization using int3 breakpoint.
682 * 636 *
683 * Note: Must be called under get_online_cpus() and text_mutex. 637 * The way it is done:
638 * - add a int3 trap to the address that will be patched
639 * - sync cores
640 * - update all but the first byte of the patched range
641 * - sync cores
642 * - replace the first byte (int3) by the first byte of
643 * replacing opcode
644 * - sync cores
645 *
646 * Note: must be called under text_mutex.
684 */ 647 */
685void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) 648void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
686{ 649{
687 struct text_poke_params tpp = {.params = params, .nparams = n}; 650 unsigned char int3 = 0xcc;
651
652 bp_int3_handler = handler;
653 bp_int3_addr = (u8 *)addr + sizeof(int3);
654 bp_patching_in_progress = true;
655 /*
656 * Corresponding read barrier in int3 notifier for
 657	 * making sure the in_progress flag is correctly ordered wrt.
658 * patching
659 */
660 smp_wmb();
661
662 text_poke(addr, &int3, sizeof(int3));
688 663
689 atomic_set(&stop_machine_first, 1); 664 on_each_cpu(do_sync_core, NULL, 1);
690 wrote_text = 0; 665
691 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); 666 if (len - sizeof(int3) > 0) {
667 /* patch all but the first byte */
668 text_poke((char *)addr + sizeof(int3),
669 (const char *) opcode + sizeof(int3),
670 len - sizeof(int3));
671 /*
672 * According to Intel, this core syncing is very likely
673 * not necessary and we'd be safe even without it. But
674 * better safe than sorry (plus there's not only Intel).
675 */
676 on_each_cpu(do_sync_core, NULL, 1);
677 }
678
679 /* patch the first byte */
680 text_poke(addr, opcode, sizeof(int3));
681
682 on_each_cpu(do_sync_core, NULL, 1);
683
684 bp_patching_in_progress = false;
685 smp_wmb();
686
687 return addr;
692} 688}
689
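
The new scheme trades stop_machine() for an int3 handshake: arm a breakpoint byte, sync cores, write the tail bytes, sync, then restore the first byte and sync again, while poke_int3_handler() diverts any CPU that hits the transient int3 to the supplied handler. The stub program below only illustrates that ordering and the release/acquire pairing on bp_patching_in_progress; text_poke_stub() and sync_cores_stub() stand in for the kernel's text_poke() and on_each_cpu(do_sync_core), so this is a sketch of the sequence, not the kernel code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static bool bp_patching_in_progress;
static unsigned char text[8];			/* stands in for kernel text */

static void text_poke_stub(void *addr, const void *opcode, size_t len)
{
	memcpy(addr, opcode, len);		/* the real text_poke() writes via a fixmap */
}

static void sync_cores_stub(void)
{
	puts("IPI: serialize every core");	/* on_each_cpu(do_sync_core, NULL, 1) */
}

static void text_poke_bp_sketch(void *addr, const void *opcode, size_t len)
{
	unsigned char int3 = 0xcc;

	bp_patching_in_progress = true;
	__atomic_thread_fence(__ATOMIC_RELEASE);	/* pairs with the handler's smp_rmb() */

	text_poke_stub(addr, &int3, 1);			/* 1: arm the breakpoint byte */
	sync_cores_stub();

	if (len > 1) {
		text_poke_stub((char *)addr + 1, (const char *)opcode + 1, len - 1);
		sync_cores_stub();			/* 2: make the tail bytes visible */
	}

	text_poke_stub(addr, opcode, 1);		/* 3: replace the int3 itself */
	sync_cores_stub();

	bp_patching_in_progress = false;
}

int main(void)
{
	unsigned char jmp[5] = { 0xe9, 0, 0, 0, 0 };

	text_poke_bp_sketch(text, jmp, sizeof(jmp));
	return bp_patching_in_progress;			/* 0: patching window closed */
}
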
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 3048ded1b598..59554dca96ec 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, 21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, 22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
23 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
23 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, 24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
24 {} 25 {}
25}; 26};
@@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids);
27 28
28static const struct pci_device_id amd_nb_link_ids[] = { 29static const struct pci_device_id amd_nb_link_ids[] = {
29 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 30 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
31 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
30 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, 32 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
31 {} 33 {}
32}; 34};
@@ -81,13 +83,20 @@ int amd_cache_northbridges(void)
81 next_northbridge(misc, amd_nb_misc_ids); 83 next_northbridge(misc, amd_nb_misc_ids);
82 node_to_amd_nb(i)->link = link = 84 node_to_amd_nb(i)->link = link =
83 next_northbridge(link, amd_nb_link_ids); 85 next_northbridge(link, amd_nb_link_ids);
84 } 86 }
85 87
 88	/* GART present only on family 0x15, up to model 0fh */
86 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || 89 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
87 boot_cpu_data.x86 == 0x15) 90 (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
88 amd_northbridges.flags |= AMD_NB_GART; 91 amd_northbridges.flags |= AMD_NB_GART;
89 92
90 /* 93 /*
94 * Check for L3 cache presence.
95 */
96 if (!cpuid_edx(0x80000006))
97 return 0;
98
99 /*
91 * Some CPU families support L3 Cache Index Disable. There are some 100 * Some CPU families support L3 Cache Index Disable. There are some
92 * limitations because of E382 and E388 on family 0x10. 101 * limitations because of E382 and E388 on family 0x10.
93 */ 102 */
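
The new cpuid_edx(0x80000006) test simply treats a zero L3 descriptor as "no L3 cache" and skips the L3 index-disable setup. The same leaf can be inspected from user space with the cpuid() helper from the first sketch; the size decoding below follows AMD's documented layout (bits 31:18, in 512 KB units) and is for illustration only:

static void report_l3(void)
{
	uint32_t eax, ebx, ecx, edx;

	cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
	if (!edx)
		puts("no L3 cache reported");
	else
		printf("L3 cache: %u KB\n", (edx >> 18) * 512);
}
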
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index eca89c53a7f5..a7eb82d9b012 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -913,7 +913,7 @@ static void local_apic_timer_interrupt(void)
913 * [ if a single-CPU system runs an SMP kernel then we call the local 913 * [ if a single-CPU system runs an SMP kernel then we call the local
914 * interrupt as well. Thus we cannot inline the local irq ... ] 914 * interrupt as well. Thus we cannot inline the local irq ... ]
915 */ 915 */
916void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) 916__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
917{ 917{
918 struct pt_regs *old_regs = set_irq_regs(regs); 918 struct pt_regs *old_regs = set_irq_regs(regs);
919 919
@@ -932,7 +932,7 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
932 set_irq_regs(old_regs); 932 set_irq_regs(old_regs);
933} 933}
934 934
935void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) 935__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
936{ 936{
937 struct pt_regs *old_regs = set_irq_regs(regs); 937 struct pt_regs *old_regs = set_irq_regs(regs);
938 938
@@ -1946,14 +1946,14 @@ static inline void __smp_spurious_interrupt(void)
1946 "should never happen.\n", smp_processor_id()); 1946 "should never happen.\n", smp_processor_id());
1947} 1947}
1948 1948
1949void smp_spurious_interrupt(struct pt_regs *regs) 1949__visible void smp_spurious_interrupt(struct pt_regs *regs)
1950{ 1950{
1951 entering_irq(); 1951 entering_irq();
1952 __smp_spurious_interrupt(); 1952 __smp_spurious_interrupt();
1953 exiting_irq(); 1953 exiting_irq();
1954} 1954}
1955 1955
1956void smp_trace_spurious_interrupt(struct pt_regs *regs) 1956__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
1957{ 1957{
1958 entering_irq(); 1958 entering_irq();
1959 trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); 1959 trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
@@ -2002,14 +2002,14 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
2002 2002
2003} 2003}
2004 2004
2005void smp_error_interrupt(struct pt_regs *regs) 2005__visible void smp_error_interrupt(struct pt_regs *regs)
2006{ 2006{
2007 entering_irq(); 2007 entering_irq();
2008 __smp_error_interrupt(regs); 2008 __smp_error_interrupt(regs);
2009 exiting_irq(); 2009 exiting_irq();
2010} 2010}
2011 2011
2012void smp_trace_error_interrupt(struct pt_regs *regs) 2012__visible void smp_trace_error_interrupt(struct pt_regs *regs)
2013{ 2013{
2014 entering_irq(); 2014 entering_irq();
2015 trace_error_apic_entry(ERROR_APIC_VECTOR); 2015 trace_error_apic_entry(ERROR_APIC_VECTOR);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ed796ccc32c..e63a5bd2a78f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic,
1534 } 1534 }
1535} 1535}
1536 1536
1537void ioapic_zap_locks(void)
1538{
1539 raw_spin_lock_init(&ioapic_lock);
1540}
1541
1537__apicdebuginit(void) print_IO_APIC(int ioapic_idx) 1542__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1538{ 1543{
1539 union IO_APIC_reg_00 reg_00; 1544 union IO_APIC_reg_00 reg_00;
@@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3375{ 3380{
3376 unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; 3381 unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
3377 int ret; 3382 int ret;
3383 struct IO_APIC_route_entry orig_entry;
3378 3384
3379 /* Avoid redundant programming */ 3385 /* Avoid redundant programming */
3380 if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { 3386 if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
3381 pr_debug("Pin %d-%d already programmed\n", 3387 pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
3382 mpc_ioapic_id(ioapic_idx), pin); 3388 orig_entry = ioapic_read_entry(attr->ioapic, pin);
3383 return 0; 3389 if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
3390 return 0;
3391 return -EBUSY;
3384 } 3392 }
3385 ret = io_apic_setup_irq_pin(irq, node, attr); 3393 ret = io_apic_setup_irq_pin(irq, node, attr);
3386 if (!ret) 3394 if (!ret)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 53a4e2744846..3ab03430211d 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -392,7 +392,7 @@ static struct cpuidle_device apm_cpuidle_device;
392/* 392/*
393 * Local variables 393 * Local variables
394 */ 394 */
395static struct { 395__visible struct {
396 unsigned long offset; 396 unsigned long offset;
397 unsigned short segment; 397 unsigned short segment;
398} apm_bios_entry; 398} apm_bios_entry;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f654ecefea5b..903a264af981 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -66,8 +66,8 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
66 * performance at the same time.. 66 * performance at the same time..
67 */ 67 */
68 68
69extern void vide(void); 69extern __visible void vide(void);
70__asm__(".align 4\nvide: ret"); 70__asm__(".globl vide\n\t.align 4\nvide: ret");
71 71
72static void init_amd_k5(struct cpuinfo_x86 *c) 72static void init_amd_k5(struct cpuinfo_x86 *c)
73{ 73{
@@ -512,7 +512,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
512 512
513static const int amd_erratum_383[]; 513static const int amd_erratum_383[];
514static const int amd_erratum_400[]; 514static const int amd_erratum_400[];
515static bool cpu_has_amd_erratum(const int *erratum); 515static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
516 516
517static void init_amd(struct cpuinfo_x86 *c) 517static void init_amd(struct cpuinfo_x86 *c)
518{ 518{
@@ -729,11 +729,11 @@ static void init_amd(struct cpuinfo_x86 *c)
729 value &= ~(1ULL << 24); 729 value &= ~(1ULL << 24);
730 wrmsrl_safe(MSR_AMD64_BU_CFG2, value); 730 wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
731 731
732 if (cpu_has_amd_erratum(amd_erratum_383)) 732 if (cpu_has_amd_erratum(c, amd_erratum_383))
733 set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); 733 set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
734 } 734 }
735 735
736 if (cpu_has_amd_erratum(amd_erratum_400)) 736 if (cpu_has_amd_erratum(c, amd_erratum_400))
737 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 737 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
738 738
739 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 739 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
@@ -878,23 +878,13 @@ static const int amd_erratum_400[] =
878static const int amd_erratum_383[] = 878static const int amd_erratum_383[] =
879 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); 879 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
880 880
881static bool cpu_has_amd_erratum(const int *erratum) 881
882static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
882{ 883{
883 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
884 int osvw_id = *erratum++; 884 int osvw_id = *erratum++;
885 u32 range; 885 u32 range;
886 u32 ms; 886 u32 ms;
887 887
888 /*
889 * If called early enough that current_cpu_data hasn't been initialized
890 * yet, fall back to boot_cpu_data.
891 */
892 if (cpu->x86 == 0)
893 cpu = &boot_cpu_data;
894
895 if (cpu->x86_vendor != X86_VENDOR_AMD)
896 return false;
897
898 if (osvw_id >= 0 && osvw_id < 65536 && 888 if (osvw_id >= 0 && osvw_id < 65536 &&
899 cpu_has(cpu, X86_FEATURE_OSVW)) { 889 cpu_has(cpu, X86_FEATURE_OSVW)) {
900 u64 osvw_len; 890 u64 osvw_len;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 25eb2747b063..2793d1f095a2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1076,7 +1076,7 @@ struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
1076 (unsigned long) debug_idt_table }; 1076 (unsigned long) debug_idt_table };
1077 1077
1078DEFINE_PER_CPU_FIRST(union irq_stack_union, 1078DEFINE_PER_CPU_FIRST(union irq_stack_union,
1079 irq_stack_union) __aligned(PAGE_SIZE); 1079 irq_stack_union) __aligned(PAGE_SIZE) __visible;
1080 1080
1081/* 1081/*
1082 * The following four percpu variables are hot. Align current_task to 1082 * The following four percpu variables are hot. Align current_task to
@@ -1093,7 +1093,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
1093DEFINE_PER_CPU(char *, irq_stack_ptr) = 1093DEFINE_PER_CPU(char *, irq_stack_ptr) =
1094 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; 1094 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
1095 1095
1096DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1096DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
1097 1097
1098DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1098DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1099 1099
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 87279212d318..36ce402a3fa5 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -25,11 +25,6 @@
25#include <asm/processor.h> 25#include <asm/processor.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28/*
29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 28static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 29{
35#ifdef CONFIG_XEN_PVHVM 30#ifdef CONFIG_XEN_PVHVM
@@ -49,15 +44,19 @@ static inline void __init
49detect_hypervisor_vendor(void) 44detect_hypervisor_vendor(void)
50{ 45{
51 const struct hypervisor_x86 *h, * const *p; 46 const struct hypervisor_x86 *h, * const *p;
47 uint32_t pri, max_pri = 0;
52 48
53 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { 49 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
54 h = *p; 50 h = *p;
55 if (h->detect()) { 51 pri = h->detect();
52 if (pri != 0 && pri > max_pri) {
53 max_pri = pri;
56 x86_hyper = h; 54 x86_hyper = h;
57 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
58 break;
59 } 55 }
60 } 56 }
57
58 if (max_pri)
59 printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
61} 60}
62 61
63void init_hypervisor(struct cpuinfo_x86 *c) 62void init_hypervisor(struct cpuinfo_x86 *c)
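
detect() now returns a priority (typically the CPUID leaf base) instead of a boolean, so a guest that matches several entries, for example a hypervisor emulating another hypervisor's interface, ends up with the highest-priority match rather than whichever entry happens to be listed first. A self-contained illustration of the selection loop, with stub detectors standing in for the real ->detect callbacks:

#include <stdint.h>
#include <stdio.h>

struct hypervisor_stub {
	const char *name;
	uint32_t (*detect)(void);
};

static uint32_t detect_a(void) { return 0; }		/* not running on this one */
static uint32_t detect_b(void) { return 0x40000000; }	/* e.g. a compatibility base */
static uint32_t detect_c(void) { return 0x40000100; }	/* the preferred, higher base */

static const struct hypervisor_stub hypervisors[] = {
	{ "A", detect_a }, { "B", detect_b }, { "C", detect_c },
};

int main(void)
{
	const struct hypervisor_stub *chosen = NULL;
	uint32_t pri, max_pri = 0;

	for (size_t i = 0; i < sizeof(hypervisors) / sizeof(hypervisors[0]); i++) {
		pri = hypervisors[i].detect();
		if (pri != 0 && pri > max_pri) {
			max_pri = pri;
			chosen = &hypervisors[i];
		}
	}
	if (chosen)
		printf("Hypervisor detected: %s\n", chosen->name);
	return 0;
}
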
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa5d3b7..09edd0b65fef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void); 25struct dentry *mce_get_debugfs_dir(void);
26 26
27extern struct mce_bank *mce_banks; 27extern struct mce_bank *mce_banks;
28extern mce_banks_t mce_banks_ce_disabled;
28 29
29#ifdef CONFIG_X86_MCE_INTEL 30#ifdef CONFIG_X86_MCE_INTEL
30unsigned long mce_intel_adjust_timer(unsigned long interval); 31unsigned long mce_intel_adjust_timer(unsigned long interval);
31void mce_intel_cmci_poll(void); 32void mce_intel_cmci_poll(void);
32void mce_intel_hcpu_update(unsigned long cpu); 33void mce_intel_hcpu_update(unsigned long cpu);
34void cmci_disable_bank(int bank);
33#else 35#else
34# define mce_intel_adjust_timer mce_adjust_timer_default 36# define mce_intel_adjust_timer mce_adjust_timer_default
35static inline void mce_intel_cmci_poll(void) { } 37static inline void mce_intel_cmci_poll(void) { }
36static inline void mce_intel_hcpu_update(unsigned long cpu) { } 38static inline void mce_intel_hcpu_update(unsigned long cpu) { }
39static inline void cmci_disable_bank(int bank) { }
37#endif 40#endif
38 41
39void mce_timer_kick(unsigned long interval); 42void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index e2703520d120..c370e1c4468b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -111,8 +111,8 @@ static struct severity {
111#ifdef CONFIG_MEMORY_FAILURE 111#ifdef CONFIG_MEMORY_FAILURE
112 MCESEV( 112 MCESEV(
113 KEEP, "Action required but unaffected thread is continuable", 113 KEEP, "Action required but unaffected thread is continuable",
114 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR), 114 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
115 MCGMASK(MCG_STATUS_RIPV, MCG_STATUS_RIPV) 115 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
116 ), 116 ),
117 MCESEV( 117 MCESEV(
118 AR, "Action required: data load error in a user process", 118 AR, "Action required: data load error in a user process",
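
Reading the MCESEV() rows as mask/expected-value filters: a rule applies when the masked MCi_STATUS (and, via MCGMASK, MCG_STATUS) bits read back as the expected value, so dropping MCACOD from the mask stops requiring a zero error code, and adding MCG_STATUS_EIPV to the MCGMASK while keeping only RIPV in the result makes the rule match only when EIPV is clear. A sketch of that matching logic with placeholder bit values, not real status encodings:

#include <stdint.h>

struct sev_filter {
	uint64_t mask;		/* which status bits the rule looks at */
	uint64_t result;	/* the value those bits must have      */
};

/* e.g. { .mask = RIPV | EIPV, .result = RIPV } only matches when EIPV is clear */
static int sev_matches(const struct sev_filter *f, uint64_t status)
{
	return (status & f->mask) == f->result;
}
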
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 87a65c939bcd..b3218cdee95f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
97 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 97 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
98}; 98};
99 99
100/*
101 * MCA banks controlled through firmware first for corrected errors.
102 * This is a global list of banks for which we won't enable CMCI and we
103 * won't poll. Firmware controls these banks and is responsible for
104 * reporting corrected errors through GHES. Uncorrected/recoverable
105 * errors are still notified through a machine check.
106 */
107mce_banks_t mce_banks_ce_disabled;
108
100static DEFINE_PER_CPU(struct work_struct, mce_work); 109static DEFINE_PER_CPU(struct work_struct, mce_work);
101 110
102static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); 111static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = {
1935 &mce_chrdev_ops, 1944 &mce_chrdev_ops,
1936}; 1945};
1937 1946
1947static void __mce_disable_bank(void *arg)
1948{
1949 int bank = *((int *)arg);
1950 __clear_bit(bank, __get_cpu_var(mce_poll_banks));
1951 cmci_disable_bank(bank);
1952}
1953
1954void mce_disable_bank(int bank)
1955{
1956 if (bank >= mca_cfg.banks) {
1957 pr_warn(FW_BUG
1958 "Ignoring request to disable invalid MCA bank %d.\n",
1959 bank);
1960 return;
1961 }
1962 set_bit(bank, mce_banks_ce_disabled);
1963 on_each_cpu(__mce_disable_bank, &bank, 1);
1964}
1965
1938/* 1966/*
1939 * mce=off Disables machine check 1967 * mce=off Disables machine check
1940 * mce=no_cmci Disables CMCI 1968 * mce=no_cmci Disables CMCI
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index d56405309dc1..4cfe0458ca66 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -203,6 +203,10 @@ static void cmci_discover(int banks)
203 if (test_bit(i, owned)) 203 if (test_bit(i, owned))
204 continue; 204 continue;
205 205
206 /* Skip banks in firmware first mode */
207 if (test_bit(i, mce_banks_ce_disabled))
208 continue;
209
206 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 210 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
207 211
208 /* Already owned by someone else? */ 212 /* Already owned by someone else? */
@@ -271,6 +275,19 @@ void cmci_recheck(void)
271 local_irq_restore(flags); 275 local_irq_restore(flags);
272} 276}
273 277
278/* Caller must hold the lock on cmci_discover_lock */
279static void __cmci_disable_bank(int bank)
280{
281 u64 val;
282
283 if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
284 return;
285 rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
286 val &= ~MCI_CTL2_CMCI_EN;
287 wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
288 __clear_bit(bank, __get_cpu_var(mce_banks_owned));
289}
290
274/* 291/*
275 * Disable CMCI on this CPU for all banks it owns when it goes down. 292 * Disable CMCI on this CPU for all banks it owns when it goes down.
276 * This allows other CPUs to claim the banks on rediscovery. 293 * This allows other CPUs to claim the banks on rediscovery.
@@ -280,20 +297,12 @@ void cmci_clear(void)
280 unsigned long flags; 297 unsigned long flags;
281 int i; 298 int i;
282 int banks; 299 int banks;
283 u64 val;
284 300
285 if (!cmci_supported(&banks)) 301 if (!cmci_supported(&banks))
286 return; 302 return;
287 raw_spin_lock_irqsave(&cmci_discover_lock, flags); 303 raw_spin_lock_irqsave(&cmci_discover_lock, flags);
288 for (i = 0; i < banks; i++) { 304 for (i = 0; i < banks; i++)
289 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 305 __cmci_disable_bank(i);
290 continue;
291 /* Disable CMCI */
292 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
293 val &= ~MCI_CTL2_CMCI_EN;
294 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
295 __clear_bit(i, __get_cpu_var(mce_banks_owned));
296 }
297 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 306 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
298} 307}
299 308
@@ -327,6 +336,19 @@ void cmci_reenable(void)
327 cmci_discover(banks); 336 cmci_discover(banks);
328} 337}
329 338
339void cmci_disable_bank(int bank)
340{
341 int banks;
342 unsigned long flags;
343
344 if (!cmci_supported(&banks))
345 return;
346
347 raw_spin_lock_irqsave(&cmci_discover_lock, flags);
348 __cmci_disable_bank(bank);
349 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
350}
351
330static void intel_init_cmci(void) 352static void intel_init_cmci(void)
331{ 353{
332 int banks; 354 int banks;
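
__cmci_disable_bank() is simply "clear MCI_CTL2_CMCI_EN in IA32_MCi_CTL2 and drop ownership", done under cmci_discover_lock so rediscovery on other CPUs cannot race with it. The bit can be observed read-only from user space through the msr driver; the MSR base 0x280 and the CMCI_EN bit position 30 below follow the SDM and are stated here as assumptions, not taken from this patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int bank = argc > 1 ? atoi(argv[1]) : 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* needs the msr module and root */
	uint64_t ctl2;

	if (fd < 0 || pread(fd, &ctl2, sizeof(ctl2), 0x280 + bank) != sizeof(ctl2)) {
		perror("IA32_MCi_CTL2 read");
		return 1;
	}
	printf("bank %d: CMCI %s\n", bank,
	       (ctl2 >> 30) & 1 ? "enabled" : "disabled");
	close(fd);
	return 0;
}
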
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f4be53ea04b..71a39f3621ba 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -27,20 +27,23 @@
27struct ms_hyperv_info ms_hyperv; 27struct ms_hyperv_info ms_hyperv;
28EXPORT_SYMBOL_GPL(ms_hyperv); 28EXPORT_SYMBOL_GPL(ms_hyperv);
29 29
30static bool __init ms_hyperv_platform(void) 30static uint32_t __init ms_hyperv_platform(void)
31{ 31{
32 u32 eax; 32 u32 eax;
33 u32 hyp_signature[3]; 33 u32 hyp_signature[3];
34 34
35 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) 35 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
36 return false; 36 return 0;
37 37
38 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, 38 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
39 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); 39 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
40 40
41 return eax >= HYPERV_CPUID_MIN && 41 if (eax >= HYPERV_CPUID_MIN &&
42 eax <= HYPERV_CPUID_MAX && 42 eax <= HYPERV_CPUID_MAX &&
43 !memcmp("Microsoft Hv", hyp_signature, 12); 43 !memcmp("Microsoft Hv", hyp_signature, 12))
44 return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
45
46 return 0;
44} 47}
45 48
46static cycle_t read_hv_clock(struct clocksource *arg) 49static cycle_t read_hv_clock(struct clocksource *arg)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a7c7305030cc..8355c84b9729 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1884,6 +1884,7 @@ static struct pmu pmu = {
1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1885{ 1885{
1886 userpg->cap_usr_time = 0; 1886 userpg->cap_usr_time = 0;
1887 userpg->cap_usr_time_zero = 0;
1887 userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; 1888 userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
1888 userpg->pmc_width = x86_pmu.cntval_bits; 1889 userpg->pmc_width = x86_pmu.cntval_bits;
1889 1890
@@ -1897,6 +1898,11 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1897 userpg->time_mult = this_cpu_read(cyc2ns); 1898 userpg->time_mult = this_cpu_read(cyc2ns);
1898 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1899 userpg->time_shift = CYC2NS_SCALE_FACTOR;
1899 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1900 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
1901
1902 if (sched_clock_stable && !check_tsc_disabled()) {
1903 userpg->cap_usr_time_zero = 1;
1904 userpg->time_zero = this_cpu_read(cyc2ns_offset);
1905 }
1900} 1906}
1901 1907
1902/* 1908/*
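
With cap_usr_time_zero set, a consumer of the mmap'd perf page can convert raw TSC values to perf timestamps without entering the kernel, using time_zero/time_mult/time_shift from struct perf_event_mmap_page. The quotient/remainder split below mirrors the conversion described in the perf_event.h documentation for this capability; the calibration constants in main() are placeholders standing in for values read from a real mmap'd page:

#include <stdint.h>
#include <stdio.h>

static uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t tsc_to_perf_time(uint64_t cyc, uint64_t time_zero,
				 uint32_t time_mult, uint16_t time_shift)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_zero + quot * time_mult + ((rem * time_mult) >> time_shift);
}

int main(void)
{
	/* placeholder calibration; real values come from the mmap'd page */
	uint64_t zero  = 0;
	uint32_t mult  = 427;	/* roughly a 2.4 GHz TSC with shift 10 */
	uint16_t shift = 10;

	printf("now ~ %llu ns\n",
	       (unsigned long long)tsc_to_perf_time(rdtsc(), zero, mult, shift));
	return 0;
}
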
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 97e557bc4c91..cc16faae0538 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -641,6 +641,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[];
641 641
642extern struct event_constraint intel_atom_pebs_event_constraints[]; 642extern struct event_constraint intel_atom_pebs_event_constraints[];
643 643
644extern struct event_constraint intel_slm_pebs_event_constraints[];
645
644extern struct event_constraint intel_nehalem_pebs_event_constraints[]; 646extern struct event_constraint intel_nehalem_pebs_event_constraints[];
645 647
646extern struct event_constraint intel_westmere_pebs_event_constraints[]; 648extern struct event_constraint intel_westmere_pebs_event_constraints[];
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 4cbe03287b08..beeb7cc07044 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu)
347 struct amd_nb *nb; 347 struct amd_nb *nb;
348 int i; 348 int i;
349 349
350 nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, 350 nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
351 cpu_to_node(cpu));
352 if (!nb) 351 if (!nb)
353 return NULL; 352 return NULL;
354 353
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index fbc9210b45bc..0abf6742a8b0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -81,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
81 81
82static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 82static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
83{ 83{
84 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), 84 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
85 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
85 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), 86 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
86 EVENT_EXTRA_END 87 EVENT_EXTRA_END
87}; 88};
@@ -143,8 +144,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
143 144
144static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 145static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
145{ 146{
146 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), 147 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
147 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), 148 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
149 INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
148 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), 150 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
149 EVENT_EXTRA_END 151 EVENT_EXTRA_END
150}; 152};
@@ -162,16 +164,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
162 EVENT_CONSTRAINT_END 164 EVENT_CONSTRAINT_END
163}; 165};
164 166
167static struct event_constraint intel_slm_event_constraints[] __read_mostly =
168{
169 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
170 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
171 FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */
172 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
173 EVENT_CONSTRAINT_END
174};
175
165static struct extra_reg intel_snb_extra_regs[] __read_mostly = { 176static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
166 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), 177 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
167 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), 178 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
179 INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
168 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), 180 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
169 EVENT_EXTRA_END 181 EVENT_EXTRA_END
170}; 182};
171 183
172static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { 184static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
173 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), 185 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
174 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), 186 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
187 INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
175 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), 188 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
176 EVENT_EXTRA_END 189 EVENT_EXTRA_END
177}; 190};
@@ -882,6 +895,140 @@ static __initconst const u64 atom_hw_cache_event_ids
882 }, 895 },
883}; 896};
884 897
898static struct extra_reg intel_slm_extra_regs[] __read_mostly =
899{
900 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
901 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0),
902 INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1),
903 EVENT_EXTRA_END
904};
905
906#define SLM_DMND_READ SNB_DMND_DATA_RD
907#define SLM_DMND_WRITE SNB_DMND_RFO
908#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
909
910#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
911#define SLM_LLC_ACCESS SNB_RESP_ANY
912#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
913
914static __initconst const u64 slm_hw_cache_extra_regs
915 [PERF_COUNT_HW_CACHE_MAX]
916 [PERF_COUNT_HW_CACHE_OP_MAX]
917 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
918{
919 [ C(LL ) ] = {
920 [ C(OP_READ) ] = {
921 [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
922 [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS,
923 },
924 [ C(OP_WRITE) ] = {
925 [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
926 [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
927 },
928 [ C(OP_PREFETCH) ] = {
929 [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
930 [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
931 },
932 },
933};
934
935static __initconst const u64 slm_hw_cache_event_ids
936 [PERF_COUNT_HW_CACHE_MAX]
937 [PERF_COUNT_HW_CACHE_OP_MAX]
938 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
939{
940 [ C(L1D) ] = {
941 [ C(OP_READ) ] = {
942 [ C(RESULT_ACCESS) ] = 0,
943 [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
944 },
945 [ C(OP_WRITE) ] = {
946 [ C(RESULT_ACCESS) ] = 0,
947 [ C(RESULT_MISS) ] = 0,
948 },
949 [ C(OP_PREFETCH) ] = {
950 [ C(RESULT_ACCESS) ] = 0,
951 [ C(RESULT_MISS) ] = 0,
952 },
953 },
954 [ C(L1I ) ] = {
955 [ C(OP_READ) ] = {
956 [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
 957 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
958 },
959 [ C(OP_WRITE) ] = {
960 [ C(RESULT_ACCESS) ] = -1,
961 [ C(RESULT_MISS) ] = -1,
962 },
963 [ C(OP_PREFETCH) ] = {
964 [ C(RESULT_ACCESS) ] = 0,
965 [ C(RESULT_MISS) ] = 0,
966 },
967 },
968 [ C(LL ) ] = {
969 [ C(OP_READ) ] = {
970 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
971 [ C(RESULT_ACCESS) ] = 0x01b7,
972 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
973 [ C(RESULT_MISS) ] = 0x01b7,
974 },
975 [ C(OP_WRITE) ] = {
976 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
977 [ C(RESULT_ACCESS) ] = 0x01b7,
978 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
979 [ C(RESULT_MISS) ] = 0x01b7,
980 },
981 [ C(OP_PREFETCH) ] = {
982 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
983 [ C(RESULT_ACCESS) ] = 0x01b7,
984 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
985 [ C(RESULT_MISS) ] = 0x01b7,
986 },
987 },
988 [ C(DTLB) ] = {
989 [ C(OP_READ) ] = {
990 [ C(RESULT_ACCESS) ] = 0,
991 [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
992 },
993 [ C(OP_WRITE) ] = {
994 [ C(RESULT_ACCESS) ] = 0,
995 [ C(RESULT_MISS) ] = 0,
996 },
997 [ C(OP_PREFETCH) ] = {
998 [ C(RESULT_ACCESS) ] = 0,
999 [ C(RESULT_MISS) ] = 0,
1000 },
1001 },
1002 [ C(ITLB) ] = {
1003 [ C(OP_READ) ] = {
1004 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
1005 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
1006 },
1007 [ C(OP_WRITE) ] = {
1008 [ C(RESULT_ACCESS) ] = -1,
1009 [ C(RESULT_MISS) ] = -1,
1010 },
1011 [ C(OP_PREFETCH) ] = {
1012 [ C(RESULT_ACCESS) ] = -1,
1013 [ C(RESULT_MISS) ] = -1,
1014 },
1015 },
1016 [ C(BPU ) ] = {
1017 [ C(OP_READ) ] = {
1018 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
1019 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
1020 },
1021 [ C(OP_WRITE) ] = {
1022 [ C(RESULT_ACCESS) ] = -1,
1023 [ C(RESULT_MISS) ] = -1,
1024 },
1025 [ C(OP_PREFETCH) ] = {
1026 [ C(RESULT_ACCESS) ] = -1,
1027 [ C(RESULT_MISS) ] = -1,
1028 },
1029 },
1030};
1031
885static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) 1032static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
886{ 1033{
887 /* user explicitly requested branch sampling */ 1034 /* user explicitly requested branch sampling */
@@ -1301,11 +1448,11 @@ static void intel_fixup_er(struct perf_event *event, int idx)
1301 1448
1302 if (idx == EXTRA_REG_RSP_0) { 1449 if (idx == EXTRA_REG_RSP_0) {
1303 event->hw.config &= ~INTEL_ARCH_EVENT_MASK; 1450 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1304 event->hw.config |= 0x01b7; 1451 event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
1305 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; 1452 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1306 } else if (idx == EXTRA_REG_RSP_1) { 1453 } else if (idx == EXTRA_REG_RSP_1) {
1307 event->hw.config &= ~INTEL_ARCH_EVENT_MASK; 1454 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1308 event->hw.config |= 0x01bb; 1455 event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
1309 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; 1456 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1310 } 1457 }
1311} 1458}
@@ -2176,6 +2323,21 @@ __init int intel_pmu_init(void)
2176 pr_cont("Atom events, "); 2323 pr_cont("Atom events, ");
2177 break; 2324 break;
2178 2325
2326 case 55: /* Atom 22nm "Silvermont" */
2327 memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
2328 sizeof(hw_cache_event_ids));
2329 memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
2330 sizeof(hw_cache_extra_regs));
2331
2332 intel_pmu_lbr_init_atom();
2333
2334 x86_pmu.event_constraints = intel_slm_event_constraints;
2335 x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
2336 x86_pmu.extra_regs = intel_slm_extra_regs;
2337 x86_pmu.er_flags |= ERF_HAS_RSP_1;
2338 pr_cont("Silvermont events, ");
2339 break;
2340
2179 case 37: /* 32 nm nehalem, "Clarkdale" */ 2341 case 37: /* 32 nm nehalem, "Clarkdale" */
2180 case 44: /* 32 nm nehalem, "Gulftown" */ 2342 case 44: /* 32 nm nehalem, "Gulftown" */
2181 case 47: /* 32 nm Xeon E7 */ 2343 case 47: /* 32 nm Xeon E7 */
@@ -2270,6 +2432,7 @@ __init int intel_pmu_init(void)
2270 case 70: 2432 case 70:
2271 case 71: 2433 case 71:
2272 case 63: 2434 case 63:
2435 case 69:
2273 x86_pmu.late_ack = true; 2436 x86_pmu.late_ack = true;
2274 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); 2437 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
2275 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); 2438 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
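
These per-model tables are what ultimately back PERF_TYPE_HW_CACHE events; on Silvermont, for instance, an L1D read-miss request resolves to event 0x0104 (LD_DCU_MISS) from slm_hw_cache_event_ids above. A minimal perf_event_open() user that counts L1D read misses for the calling thread:

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_L1D |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... run the workload of interest here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("L1D read misses: %lld\n", count);
	close(fd);
	return 0;
}
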
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3065c57a63c1..63438aad177f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,7 +224,7 @@ static int alloc_pebs_buffer(int cpu)
224 if (!x86_pmu.pebs) 224 if (!x86_pmu.pebs)
225 return 0; 225 return 0;
226 226
227 buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); 227 buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
228 if (unlikely(!buffer)) 228 if (unlikely(!buffer))
229 return -ENOMEM; 229 return -ENOMEM;
230 230
@@ -262,7 +262,7 @@ static int alloc_bts_buffer(int cpu)
262 if (!x86_pmu.bts) 262 if (!x86_pmu.bts)
263 return 0; 263 return 0;
264 264
265 buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); 265 buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
266 if (unlikely(!buffer)) 266 if (unlikely(!buffer))
267 return -ENOMEM; 267 return -ENOMEM;
268 268
@@ -295,7 +295,7 @@ static int alloc_ds_buffer(int cpu)
295 int node = cpu_to_node(cpu); 295 int node = cpu_to_node(cpu);
296 struct debug_store *ds; 296 struct debug_store *ds;
297 297
298 ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); 298 ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
299 if (unlikely(!ds)) 299 if (unlikely(!ds))
300 return -ENOMEM; 300 return -ENOMEM;
301 301
@@ -517,6 +517,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
517 EVENT_CONSTRAINT_END 517 EVENT_CONSTRAINT_END
518}; 518};
519 519
520struct event_constraint intel_slm_pebs_event_constraints[] = {
521 INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
522 INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
523 INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
524 INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
525 INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
526 INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
527 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
528 INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
529 INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
530 INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
531 INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
532 INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
533 INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
534 INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
535 INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
536 INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
537 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
538 INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
539 INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
540 INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
541 INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
542 INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
543 EVENT_CONSTRAINT_END
544};
545
520struct event_constraint intel_nehalem_pebs_event_constraints[] = { 546struct event_constraint intel_nehalem_pebs_event_constraints[] = {
521 INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ 547 INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
522 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ 548 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index cad791dbde95..fd8011ed4dcd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore;
6/* pci bus to socket mapping */ 6/* pci bus to socket mapping */
7static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; 7static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
8 8
9static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
10
9static DEFINE_RAW_SPINLOCK(uncore_box_lock); 11static DEFINE_RAW_SPINLOCK(uncore_box_lock);
10 12
11/* mask of cpus that collect uncore events */ 13/* mask of cpus that collect uncore events */
@@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
45DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); 47DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
46DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); 48DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
47DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); 49DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
50DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
51DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
52DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
53DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
54DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
55DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
56DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
57DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
58DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
59DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
60DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
61DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
62DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
63DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
64DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
65DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
66DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
67DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
48 68
49static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) 69static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
50{ 70{
@@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = {
281}; 301};
282 302
283static struct attribute *snbep_uncore_pcu_formats_attr[] = { 303static struct attribute *snbep_uncore_pcu_formats_attr[] = {
284 &format_attr_event.attr, 304 &format_attr_event_ext.attr,
285 &format_attr_occ_sel.attr, 305 &format_attr_occ_sel.attr,
286 &format_attr_edge.attr, 306 &format_attr_edge.attr,
287 &format_attr_inv.attr, 307 &format_attr_inv.attr,
@@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = {
301 &format_attr_edge.attr, 321 &format_attr_edge.attr,
302 &format_attr_inv.attr, 322 &format_attr_inv.attr,
303 &format_attr_thresh8.attr, 323 &format_attr_thresh8.attr,
324 &format_attr_match_rds.attr,
325 &format_attr_match_rnid30.attr,
326 &format_attr_match_rnid4.attr,
327 &format_attr_match_dnid.attr,
328 &format_attr_match_mc.attr,
329 &format_attr_match_opc.attr,
330 &format_attr_match_vnw.attr,
331 &format_attr_match0.attr,
332 &format_attr_match1.attr,
333 &format_attr_mask_rds.attr,
334 &format_attr_mask_rnid30.attr,
335 &format_attr_mask_rnid4.attr,
336 &format_attr_mask_dnid.attr,
337 &format_attr_mask_mc.attr,
338 &format_attr_mask_opc.attr,
339 &format_attr_mask_vnw.attr,
340 &format_attr_mask0.attr,
341 &format_attr_mask1.attr,
304 NULL, 342 NULL,
305}; 343};
306 344
@@ -314,8 +352,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = {
314static struct uncore_event_desc snbep_uncore_qpi_events[] = { 352static struct uncore_event_desc snbep_uncore_qpi_events[] = {
315 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), 353 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
316 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), 354 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
317 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), 355 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
318 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), 356 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
319 { /* end: all zeroes */ }, 357 { /* end: all zeroes */ },
320}; 358};
321 359
@@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = {
356 SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), 394 SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
357}; 395};
358 396
397#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
398 .init_box = snbep_uncore_pci_init_box, \
399 .disable_box = snbep_uncore_pci_disable_box, \
400 .enable_box = snbep_uncore_pci_enable_box, \
401 .disable_event = snbep_uncore_pci_disable_event, \
402 .read_counter = snbep_uncore_pci_read_counter
403
359static struct intel_uncore_ops snbep_uncore_pci_ops = { 404static struct intel_uncore_ops snbep_uncore_pci_ops = {
360 .init_box = snbep_uncore_pci_init_box, 405 SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
361 .disable_box = snbep_uncore_pci_disable_box, 406 .enable_event = snbep_uncore_pci_enable_event, \
362 .enable_box = snbep_uncore_pci_enable_box,
363 .disable_event = snbep_uncore_pci_disable_event,
364 .enable_event = snbep_uncore_pci_enable_event,
365 .read_counter = snbep_uncore_pci_read_counter,
366}; 407};
367 408
368static struct event_constraint snbep_uncore_cbox_constraints[] = { 409static struct event_constraint snbep_uncore_cbox_constraints[] = {
@@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = {
726 NULL, 767 NULL,
727}; 768};
728 769
770enum {
771 SNBEP_PCI_QPI_PORT0_FILTER,
772 SNBEP_PCI_QPI_PORT1_FILTER,
773};
774
775static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
776{
777 struct hw_perf_event *hwc = &event->hw;
778 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
779 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
780
781 if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
782 reg1->idx = 0;
783 reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
784 reg1->config = event->attr.config1;
785 reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
786 reg2->config = event->attr.config2;
787 }
788 return 0;
789}
790
791static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
792{
793 struct pci_dev *pdev = box->pci_dev;
794 struct hw_perf_event *hwc = &event->hw;
795 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
796 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
797
798 if (reg1->idx != EXTRA_REG_NONE) {
799 int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
800 struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
801 WARN_ON_ONCE(!filter_pdev);
802 if (filter_pdev) {
803 pci_write_config_dword(filter_pdev, reg1->reg,
804 (u32)reg1->config);
805 pci_write_config_dword(filter_pdev, reg1->reg + 4,
806 (u32)(reg1->config >> 32));
807 pci_write_config_dword(filter_pdev, reg2->reg,
808 (u32)reg2->config);
809 pci_write_config_dword(filter_pdev, reg2->reg + 4,
810 (u32)(reg2->config >> 32));
811 }
812 }
813
814 pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
815}
816
817static struct intel_uncore_ops snbep_uncore_qpi_ops = {
818 SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
819 .enable_event = snbep_qpi_enable_event,
820 .hw_config = snbep_qpi_hw_config,
821 .get_constraint = uncore_get_constraint,
822 .put_constraint = uncore_put_constraint,
823};
824
729#define SNBEP_UNCORE_PCI_COMMON_INIT() \ 825#define SNBEP_UNCORE_PCI_COMMON_INIT() \
730 .perf_ctr = SNBEP_PCI_PMON_CTR0, \ 826 .perf_ctr = SNBEP_PCI_PMON_CTR0, \
731 .event_ctl = SNBEP_PCI_PMON_CTL0, \ 827 .event_ctl = SNBEP_PCI_PMON_CTL0, \
@@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = {
755}; 851};
756 852
757static struct intel_uncore_type snbep_uncore_qpi = { 853static struct intel_uncore_type snbep_uncore_qpi = {
758 .name = "qpi", 854 .name = "qpi",
759 .num_counters = 4, 855 .num_counters = 4,
760 .num_boxes = 2, 856 .num_boxes = 2,
761 .perf_ctr_bits = 48, 857 .perf_ctr_bits = 48,
762 .perf_ctr = SNBEP_PCI_PMON_CTR0, 858 .perf_ctr = SNBEP_PCI_PMON_CTR0,
763 .event_ctl = SNBEP_PCI_PMON_CTL0, 859 .event_ctl = SNBEP_PCI_PMON_CTL0,
764 .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, 860 .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
765 .box_ctl = SNBEP_PCI_PMON_BOX_CTL, 861 .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
766 .ops = &snbep_uncore_pci_ops, 862 .num_shared_regs = 1,
767 .event_descs = snbep_uncore_qpi_events, 863 .ops = &snbep_uncore_qpi_ops,
768 .format_group = &snbep_uncore_qpi_format_group, 864 .event_descs = snbep_uncore_qpi_events,
865 .format_group = &snbep_uncore_qpi_format_group,
769}; 866};
770 867
771 868
@@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = {
807static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { 904static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
808 { /* Home Agent */ 905 { /* Home Agent */
809 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), 906 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
810 .driver_data = SNBEP_PCI_UNCORE_HA, 907 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
811 }, 908 },
812 { /* MC Channel 0 */ 909 { /* MC Channel 0 */
813 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), 910 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
814 .driver_data = SNBEP_PCI_UNCORE_IMC, 911 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
815 }, 912 },
816 { /* MC Channel 1 */ 913 { /* MC Channel 1 */
817 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), 914 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
818 .driver_data = SNBEP_PCI_UNCORE_IMC, 915 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
819 }, 916 },
820 { /* MC Channel 2 */ 917 { /* MC Channel 2 */
821 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), 918 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
822 .driver_data = SNBEP_PCI_UNCORE_IMC, 919 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
823 }, 920 },
824 { /* MC Channel 3 */ 921 { /* MC Channel 3 */
825 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), 922 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
826 .driver_data = SNBEP_PCI_UNCORE_IMC, 923 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
827 }, 924 },
828 { /* QPI Port 0 */ 925 { /* QPI Port 0 */
829 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), 926 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
830 .driver_data = SNBEP_PCI_UNCORE_QPI, 927 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
831 }, 928 },
832 { /* QPI Port 1 */ 929 { /* QPI Port 1 */
833 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), 930 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
834 .driver_data = SNBEP_PCI_UNCORE_QPI, 931 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
835 }, 932 },
836 { /* R2PCIe */ 933 { /* R2PCIe */
837 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), 934 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
838 .driver_data = SNBEP_PCI_UNCORE_R2PCIE, 935 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
839 }, 936 },
840 { /* R3QPI Link 0 */ 937 { /* R3QPI Link 0 */
841 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), 938 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
842 .driver_data = SNBEP_PCI_UNCORE_R3QPI, 939 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
843 }, 940 },
844 { /* R3QPI Link 1 */ 941 { /* R3QPI Link 1 */
845 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), 942 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
846 .driver_data = SNBEP_PCI_UNCORE_R3QPI, 943 .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
944 },
945 { /* QPI Port 0 filter */
946 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
947 .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
948 SNBEP_PCI_QPI_PORT0_FILTER),
949 },
 950 { /* QPI Port 1 filter */
951 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
952 .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
953 SNBEP_PCI_QPI_PORT1_FILTER),
847 }, 954 },
848 { /* end: all zeroes */ } 955 { /* end: all zeroes */ }
849}; 956};
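Each entry in the id table now packs both the uncore type and the box (or filter) index into driver_data, using the UNCORE_PCI_DEV_DATA() helpers added to perf_event_intel_uncore.h later in this diff; the new 0x3c86/0x3c96 filter functions carry the reserved type UNCORE_EXTRA_PCI_DEV instead of a real uncore type. A small stand-alone check of the encoding, mirroring the macros in the header hunk:

#include <assert.h>

#define UNCORE_PCI_DEV_DATA(type, idx)	((type << 8) | idx)
#define UNCORE_PCI_DEV_TYPE(data)	((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data)	(data & 0xff)
#define UNCORE_EXTRA_PCI_DEV		0xff	/* reserved, not a real type */

int main(void)
{
	unsigned long data = UNCORE_PCI_DEV_DATA(3, 2);	/* some type, box 2 */

	assert(UNCORE_PCI_DEV_TYPE(data) == 3);
	assert(UNCORE_PCI_DEV_IDX(data) == 2);

	/* The 0x3c86/0x3c96 filter functions use the reserved type. */
	data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1);
	assert(UNCORE_PCI_DEV_TYPE(data) == UNCORE_EXTRA_PCI_DEV);
	assert(UNCORE_PCI_DEV_IDX(data) == 1);
	return 0;
}
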
@@ -1256,71 +1363,71 @@ static struct intel_uncore_type *ivt_pci_uncores[] = {
1256static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { 1363static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
1257 { /* Home Agent 0 */ 1364 { /* Home Agent 0 */
1258 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), 1365 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
1259 .driver_data = IVT_PCI_UNCORE_HA, 1366 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
1260 }, 1367 },
1261 { /* Home Agent 1 */ 1368 { /* Home Agent 1 */
1262 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), 1369 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
1263 .driver_data = IVT_PCI_UNCORE_HA, 1370 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
1264 }, 1371 },
1265 { /* MC0 Channel 0 */ 1372 { /* MC0 Channel 0 */
1266 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), 1373 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
1267 .driver_data = IVT_PCI_UNCORE_IMC, 1374 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
1268 }, 1375 },
1269 { /* MC0 Channel 1 */ 1376 { /* MC0 Channel 1 */
1270 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), 1377 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
1271 .driver_data = IVT_PCI_UNCORE_IMC, 1378 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
1272 }, 1379 },
1273 { /* MC0 Channel 3 */ 1380 { /* MC0 Channel 3 */
1274 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), 1381 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
1275 .driver_data = IVT_PCI_UNCORE_IMC, 1382 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
1276 }, 1383 },
1277 { /* MC0 Channel 4 */ 1384 { /* MC0 Channel 4 */
1278 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), 1385 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
1279 .driver_data = IVT_PCI_UNCORE_IMC, 1386 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
1280 }, 1387 },
1281 { /* MC1 Channel 0 */ 1388 { /* MC1 Channel 0 */
1282 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), 1389 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
1283 .driver_data = IVT_PCI_UNCORE_IMC, 1390 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
1284 }, 1391 },
1285 { /* MC1 Channel 1 */ 1392 { /* MC1 Channel 1 */
1286 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), 1393 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
1287 .driver_data = IVT_PCI_UNCORE_IMC, 1394 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
1288 }, 1395 },
1289 { /* MC1 Channel 3 */ 1396 { /* MC1 Channel 3 */
1290 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), 1397 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
1291 .driver_data = IVT_PCI_UNCORE_IMC, 1398 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
1292 }, 1399 },
1293 { /* MC1 Channel 4 */ 1400 { /* MC1 Channel 4 */
1294 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), 1401 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
1295 .driver_data = IVT_PCI_UNCORE_IMC, 1402 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
1296 }, 1403 },
1297 { /* QPI0 Port 0 */ 1404 { /* QPI0 Port 0 */
1298 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), 1405 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
1299 .driver_data = IVT_PCI_UNCORE_QPI, 1406 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
1300 }, 1407 },
1301 { /* QPI0 Port 1 */ 1408 { /* QPI0 Port 1 */
1302 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), 1409 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
1303 .driver_data = IVT_PCI_UNCORE_QPI, 1410 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
1304 }, 1411 },
1305 { /* QPI1 Port 2 */ 1412 { /* QPI1 Port 2 */
1306 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), 1413 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
1307 .driver_data = IVT_PCI_UNCORE_QPI, 1414 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
1308 }, 1415 },
1309 { /* R2PCIe */ 1416 { /* R2PCIe */
1310 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), 1417 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
1311 .driver_data = IVT_PCI_UNCORE_R2PCIE, 1418 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
1312 }, 1419 },
1313 { /* R3QPI0 Link 0 */ 1420 { /* R3QPI0 Link 0 */
1314 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), 1421 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
1315 .driver_data = IVT_PCI_UNCORE_R3QPI, 1422 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
1316 }, 1423 },
1317 { /* R3QPI0 Link 1 */ 1424 { /* R3QPI0 Link 1 */
1318 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), 1425 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
1319 .driver_data = IVT_PCI_UNCORE_R3QPI, 1426 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
1320 }, 1427 },
1321 { /* R3QPI1 Link 2 */ 1428 { /* R3QPI1 Link 2 */
1322 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), 1429 PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
1323 .driver_data = IVT_PCI_UNCORE_R3QPI, 1430 .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
1324 }, 1431 },
1325 { /* end: all zeroes */ } 1432 { /* end: all zeroes */ }
1326}; 1433};
@@ -2606,7 +2713,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp
2606 2713
2607 size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); 2714 size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
2608 2715
2609 box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); 2716 box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
2610 if (!box) 2717 if (!box)
2611 return NULL; 2718 return NULL;
2612 2719
@@ -3167,16 +3274,24 @@ static bool pcidrv_registered;
3167/* 3274/*
3168 * add a pci uncore device 3275 * add a pci uncore device
3169 */ 3276 */
3170static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) 3277static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
3171{ 3278{
3172 struct intel_uncore_pmu *pmu; 3279 struct intel_uncore_pmu *pmu;
3173 struct intel_uncore_box *box; 3280 struct intel_uncore_box *box;
3174 int i, phys_id; 3281 struct intel_uncore_type *type;
3282 int phys_id;
3175 3283
3176 phys_id = pcibus_to_physid[pdev->bus->number]; 3284 phys_id = pcibus_to_physid[pdev->bus->number];
3177 if (phys_id < 0) 3285 if (phys_id < 0)
3178 return -ENODEV; 3286 return -ENODEV;
3179 3287
3288 if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
3289 extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
3290 pci_set_drvdata(pdev, NULL);
3291 return 0;
3292 }
3293
3294 type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
3180 box = uncore_alloc_box(type, 0); 3295 box = uncore_alloc_box(type, 0);
3181 if (!box) 3296 if (!box)
3182 return -ENOMEM; 3297 return -ENOMEM;
@@ -3185,21 +3300,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
3185 * for performance monitoring unit with multiple boxes, 3300 * for performance monitoring unit with multiple boxes,
3186 * each box has a different function id. 3301 * each box has a different function id.
3187 */ 3302 */
3188 for (i = 0; i < type->num_boxes; i++) { 3303 pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
3189 pmu = &type->pmus[i]; 3304 if (pmu->func_id < 0)
3190 if (pmu->func_id == pdev->devfn) 3305 pmu->func_id = pdev->devfn;
3191 break; 3306 else
3192 if (pmu->func_id < 0) { 3307 WARN_ON_ONCE(pmu->func_id != pdev->devfn);
3193 pmu->func_id = pdev->devfn;
3194 break;
3195 }
3196 pmu = NULL;
3197 }
3198
3199 if (!pmu) {
3200 kfree(box);
3201 return -EINVAL;
3202 }
3203 3308
3204 box->phys_id = phys_id; 3309 box->phys_id = phys_id;
3205 box->pci_dev = pdev; 3310 box->pci_dev = pdev;
@@ -3217,9 +3322,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
3217static void uncore_pci_remove(struct pci_dev *pdev) 3322static void uncore_pci_remove(struct pci_dev *pdev)
3218{ 3323{
3219 struct intel_uncore_box *box = pci_get_drvdata(pdev); 3324 struct intel_uncore_box *box = pci_get_drvdata(pdev);
3220 struct intel_uncore_pmu *pmu = box->pmu; 3325 struct intel_uncore_pmu *pmu;
3221 int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; 3326 int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
3222 3327
3328 box = pci_get_drvdata(pdev);
3329 if (!box) {
3330 for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
3331 if (extra_pci_dev[phys_id][i] == pdev) {
3332 extra_pci_dev[phys_id][i] = NULL;
3333 break;
3334 }
3335 }
3336 WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
3337 return;
3338 }
3339
3340 pmu = box->pmu;
3223 if (WARN_ON_ONCE(phys_id != box->phys_id)) 3341 if (WARN_ON_ONCE(phys_id != box->phys_id))
3224 return; 3342 return;
3225 3343
@@ -3240,12 +3358,6 @@ static void uncore_pci_remove(struct pci_dev *pdev)
3240 kfree(box); 3358 kfree(box);
3241} 3359}
3242 3360
3243static int uncore_pci_probe(struct pci_dev *pdev,
3244 const struct pci_device_id *id)
3245{
3246 return uncore_pci_add(pci_uncores[id->driver_data], pdev);
3247}
3248
3249static int __init uncore_pci_init(void) 3361static int __init uncore_pci_init(void)
3250{ 3362{
3251 int ret; 3363 int ret;
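With the index carried in driver_data, uncore_pci_probe() no longer scans type->pmus[] for a matching function id: it either stashes a filter-only device in extra_pci_dev[][] or indexes straight into the right pmu, and uncore_pci_remove() undoes the same two cases. A simplified, user-space model of that dispatch (fake_dev, register_box and the array sizes are illustrative; only the decode macros match the kernel's):

#include <stdio.h>

#define UNCORE_PCI_DEV_TYPE(data)	(((data) >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data)	((data) & 0xff)
#define UNCORE_EXTRA_PCI_DEV		0xff
#define UNCORE_EXTRA_PCI_DEV_MAX	2
#define SOCKET_MAX			8

struct fake_dev { int devfn; };

static struct fake_dev *extra_dev[SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];

/* Stand-in for "allocate a box and bind it to type->pmus[idx]". */
static int register_box(unsigned long type, unsigned long idx, struct fake_dev *pdev)
{
	printf("type %lu box %lu -> devfn %d\n", type, idx, pdev->devfn);
	return 0;
}

static int probe(struct fake_dev *pdev, unsigned long data, int phys_id)
{
	if (UNCORE_PCI_DEV_TYPE(data) == UNCORE_EXTRA_PCI_DEV) {
		/* Filter-only function: remember it, register no PMU. */
		extra_dev[phys_id][UNCORE_PCI_DEV_IDX(data)] = pdev;
		return 0;
	}
	return register_box(UNCORE_PCI_DEV_TYPE(data),
			    UNCORE_PCI_DEV_IDX(data), pdev);
}

int main(void)
{
	struct fake_dev ha = { .devfn = 0x81 }, filter = { .devfn = 0x86 };

	probe(&ha, (1 << 8) | 0, 0);				/* ordinary box */
	probe(&filter, (UNCORE_EXTRA_PCI_DEV << 8) | 0, 0);	/* QPI filter */
	return 0;
}
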
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 47b3d00c9d89..a80ab71a883d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -12,6 +12,15 @@
12#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC 12#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
13#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) 13#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
14 14
15#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
16#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
17#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
18#define UNCORE_EXTRA_PCI_DEV 0xff
19#define UNCORE_EXTRA_PCI_DEV_MAX 2
20
21/* support up to 8 sockets */
22#define UNCORE_SOCKET_MAX 8
23
15#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) 24#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
16 25
17/* SNB event control */ 26/* SNB event control */
@@ -108,6 +117,7 @@
108 (SNBEP_PMON_CTL_EV_SEL_MASK | \ 117 (SNBEP_PMON_CTL_EV_SEL_MASK | \
109 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ 118 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
110 SNBEP_PMON_CTL_EDGE_DET | \ 119 SNBEP_PMON_CTL_EDGE_DET | \
120 SNBEP_PMON_CTL_EV_SEL_EXT | \
111 SNBEP_PMON_CTL_INVERT | \ 121 SNBEP_PMON_CTL_INVERT | \
112 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ 122 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
113 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ 123 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 7076878404ec..628a059a9a06 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void)
93 * serial key should be enough, as this will always have a VMware 93 * serial key should be enough, as this will always have a VMware
94 * specific string when running under VMware hypervisor. 94 * specific string when running under VMware hypervisor.
95 */ 95 */
96static bool __init vmware_platform(void) 96static uint32_t __init vmware_platform(void)
97{ 97{
98 if (cpu_has_hypervisor) { 98 if (cpu_has_hypervisor) {
99 unsigned int eax; 99 unsigned int eax;
@@ -102,12 +102,12 @@ static bool __init vmware_platform(void)
102 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], 102 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
103 &hyper_vendor_id[1], &hyper_vendor_id[2]); 103 &hyper_vendor_id[1], &hyper_vendor_id[2]);
104 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) 104 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
105 return true; 105 return CPUID_VMWARE_INFO_LEAF;
106 } else if (dmi_available && dmi_name_in_serial("VMware") && 106 } else if (dmi_available && dmi_name_in_serial("VMware") &&
107 __vmware_platform()) 107 __vmware_platform())
108 return true; 108 return 1;
109 109
110 return false; 110 return 0;
111} 111}
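Returning the CPUID leaf (or a small non-zero value for the DMI fallback) instead of a bool lets the generic detection code treat the return value as a priority and keep the hypervisor whose signature sits highest; the matching change to cpu/hypervisor.c is part of this series but outside this excerpt, so the selection loop below is only a rough, stand-alone model of it:

#include <stdint.h>
#include <stddef.h>

struct hypervisor {
	const char *name;
	uint32_t (*detect)(void);	/* 0 = not present, else CPUID leaf */
};

/* Pick the hypervisor whose signature sits at the highest CPUID leaf;
 * a plain bool could not break ties when several signatures match. */
static const struct hypervisor *
choose_hypervisor(const struct hypervisor *tbl, size_t n)
{
	const struct hypervisor *best = NULL;
	uint32_t pri, max_pri = 0;

	for (size_t i = 0; i < n; i++) {
		pri = tbl[i].detect();
		if (pri != 0 && pri > max_pri) {
			max_pri = pri;
			best = &tbl[i];
		}
	}
	return best;
}
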
112 112
113/* 113/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 74467feb4dc5..e0e0841eef45 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,7 +128,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
128 cpu_emergency_svm_disable(); 128 cpu_emergency_svm_disable();
129 129
130 lapic_shutdown(); 130 lapic_shutdown();
131#if defined(CONFIG_X86_IO_APIC) 131#ifdef CONFIG_X86_IO_APIC
132 /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
133 ioapic_zap_locks();
132 disable_IO_APIC(); 134 disable_IO_APIC();
133#endif 135#endif
134#ifdef CONFIG_HPET_TIMER 136#ifdef CONFIG_HPET_TIMER
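The crash path can be entered from an interrupt handler that already holds ioapic_lock, so disable_IO_APIC() could previously deadlock the kdump handover. ioapic_zap_locks(), added elsewhere in this series, forces the lock back into a usable state; conceptually it is something like the sketch below (crash context runs on a single CPU with interrupts off, so discarding the lock state is safe):

/* Sketch: whatever state ioapic_lock was left in can simply be discarded. */
void ioapic_zap_locks(void)
{
	raw_spin_lock_init(&ioapic_lock);
}
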
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d32abeabbda5..174da5fc5a7b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -658,15 +658,18 @@ __init void e820_setup_gap(void)
658 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 658 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
659 * linked list of struct setup_data, which is parsed here. 659 * linked list of struct setup_data, which is parsed here.
660 */ 660 */
661void __init parse_e820_ext(struct setup_data *sdata) 661void __init parse_e820_ext(u64 phys_addr, u32 data_len)
662{ 662{
663 int entries; 663 int entries;
664 struct e820entry *extmap; 664 struct e820entry *extmap;
665 struct setup_data *sdata;
665 666
667 sdata = early_memremap(phys_addr, data_len);
666 entries = sdata->len / sizeof(struct e820entry); 668 entries = sdata->len / sizeof(struct e820entry);
667 extmap = (struct e820entry *)(sdata->data); 669 extmap = (struct e820entry *)(sdata->data);
668 __append_e820_map(extmap, entries); 670 __append_e820_map(extmap, entries);
669 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 671 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
672 early_iounmap(sdata, data_len);
670 printk(KERN_INFO "e820: extended physical RAM map:\n"); 673 printk(KERN_INFO "e820: extended physical RAM map:\n");
671 e820_print_map("extended"); 674 e820_print_map("extended");
672} 675}
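parse_e820_ext() now takes the physical address and length of the SETUP_E820_EXT blob and maps it itself, rather than assuming the setup_data pointer it was handed is already accessible this early in boot. The general shape of that map, parse, unmap pattern is sketched below (error handling and the e820-specific parsing are elided):

/* Sketch of the map -> parse -> unmap pattern for early setup_data. */
static void __init consume_setup_data(u64 phys_addr, u32 data_len)
{
	struct setup_data *sdata;

	sdata = early_memremap(phys_addr, data_len);
	if (!sdata)
		return;

	/* ... interpret sdata->len bytes at sdata->data here ... */

	early_iounmap(sdata, data_len);
}
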
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 94ab6b90dd3f..63bdb29b2549 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -196,15 +196,23 @@ static void __init ati_bugs_contd(int num, int slot, int func)
196static void __init intel_remapping_check(int num, int slot, int func) 196static void __init intel_remapping_check(int num, int slot, int func)
197{ 197{
198 u8 revision; 198 u8 revision;
199 u16 device;
199 200
201 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
200 revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); 202 revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
201 203
202 /* 204 /*
 203 * Revision 0x13 of this chipset supports irq remapping 205 * Revision 0x13 of every device id that triggers this quirk
 204 * but has an erratum that breaks its behavior, flag it as such 206 * has a problem draining interrupts when irq remapping is
 207 * enabled and should be flagged as broken. Additionally,
 208 * revisions 0x12 and 0x22 of device id 0x3405 have this problem.
205 */ 209 */
206 if (revision == 0x13) 210 if (revision == 0x13)
207 set_irq_remapping_broken(); 211 set_irq_remapping_broken();
212 else if ((device == 0x3405) &&
213 ((revision == 0x12) ||
214 (revision == 0x22)))
215 set_irq_remapping_broken();
208 216
209} 217}
210 218
@@ -239,6 +247,8 @@ static struct chipset early_qrk[] __initdata = {
239 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 247 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
240 { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, 248 { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
241 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 249 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
250 { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
251 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
242 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, 252 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
243 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 253 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
244 {} 254 {}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 138463a24877..06f87bece92a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void)
29 reserve_ebda_region(); 29 reserve_ebda_region();
30} 30}
31 31
32void __init i386_start_kernel(void) 32asmlinkage void __init i386_start_kernel(void)
33{ 33{
34 sanitize_boot_params(&boot_params); 34 sanitize_boot_params(&boot_params);
35 35
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 55b67614ed94..1be8e43b669e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data)
137 } 137 }
138} 138}
139 139
140void __init x86_64_start_kernel(char * real_mode_data) 140asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
141{ 141{
142 int i; 142 int i;
143 143
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 5dd87a89f011..81ba27679f18 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -409,6 +409,7 @@ enable_paging:
409/* 409/*
410 * Check if it is 486 410 * Check if it is 486
411 */ 411 */
412 movb $4,X86 # at least 486
412 cmpl $-1,X86_CPUID 413 cmpl $-1,X86_CPUID
413 je is486 414 je is486
414 415
@@ -436,7 +437,6 @@ enable_paging:
436 movl %edx,X86_CAPABILITY 437 movl %edx,X86_CAPABILITY
437 438
438is486: 439is486:
439 movb $4,X86
440 movl $0x50022,%ecx # set AM, WP, NE and MP 440 movl $0x50022,%ecx # set AM, WP, NE and MP
441 movl %cr0,%eax 441 movl %cr0,%eax
442 andl $0x80000011,%eax # Save PG,PE,ET 442 andl $0x80000011,%eax # Save PG,PE,ET
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 202d24f0f7e7..5d576ab34403 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -116,7 +116,7 @@ static void mxcsr_feature_mask_init(void)
116 116
117 if (cpu_has_fxsr) { 117 if (cpu_has_fxsr) {
118 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); 118 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
119 asm volatile("fxsave %0" : : "m" (fx_scratch)); 119 asm volatile("fxsave %0" : "+m" (fx_scratch));
120 mask = fx_scratch.mxcsr_mask; 120 mask = fx_scratch.mxcsr_mask;
121 if (mask == 0) 121 if (mask == 0)
122 mask = 0x0000ffbf; 122 mask = 0x0000ffbf;
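The constraint change is a correctness fix: fxsave stores into its operand, so listing fx_scratch as a plain "m" input told the compiler the memory is only read, letting it assume the structure still held the zeroes from the preceding memset when mxcsr_mask was loaded. Declaring it "+m" marks the operand as both read and written. A generic illustration of the pattern (not the kernel code):

#include <stdint.h>

struct fxsave_area {
	uint8_t data[512];
} __attribute__((aligned(16)));

static struct fxsave_area scratch;

static inline void fxsave_scratch(void)
{
	/* "+m": the asm both reads and, above all, writes this memory,
	 * so the compiler may not cache or fold earlier stores to it. */
	asm volatile("fxsave %0" : "+m" (scratch));
}
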
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3a8185c042a2..22d0687e7fda 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -177,7 +177,7 @@ u64 arch_irq_stat(void)
177 * SMP cross-CPU interrupts have their own specific 177 * SMP cross-CPU interrupts have their own specific
178 * handlers). 178 * handlers).
179 */ 179 */
180unsigned int __irq_entry do_IRQ(struct pt_regs *regs) 180__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181{ 181{
182 struct pt_regs *old_regs = set_irq_regs(regs); 182 struct pt_regs *old_regs = set_irq_regs(regs);
183 183
@@ -215,7 +215,7 @@ void __smp_x86_platform_ipi(void)
215 x86_platform_ipi_callback(); 215 x86_platform_ipi_callback();
216} 216}
217 217
218void smp_x86_platform_ipi(struct pt_regs *regs) 218__visible void smp_x86_platform_ipi(struct pt_regs *regs)
219{ 219{
220 struct pt_regs *old_regs = set_irq_regs(regs); 220 struct pt_regs *old_regs = set_irq_regs(regs);
221 221
@@ -229,7 +229,7 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
229/* 229/*
230 * Handler for POSTED_INTERRUPT_VECTOR. 230 * Handler for POSTED_INTERRUPT_VECTOR.
231 */ 231 */
232void smp_kvm_posted_intr_ipi(struct pt_regs *regs) 232__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
233{ 233{
234 struct pt_regs *old_regs = set_irq_regs(regs); 234 struct pt_regs *old_regs = set_irq_regs(regs);
235 235
@@ -247,7 +247,7 @@ void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
247} 247}
248#endif 248#endif
249 249
250void smp_trace_x86_platform_ipi(struct pt_regs *regs) 250__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
251{ 251{
252 struct pt_regs *old_regs = set_irq_regs(regs); 252 struct pt_regs *old_regs = set_irq_regs(regs);
253 253
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 636a55e4a13c..1de84e3ab4e0 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -22,14 +22,14 @@ static inline void __smp_irq_work_interrupt(void)
22 irq_work_run(); 22 irq_work_run();
23} 23}
24 24
25void smp_irq_work_interrupt(struct pt_regs *regs) 25__visible void smp_irq_work_interrupt(struct pt_regs *regs)
26{ 26{
27 irq_work_entering_irq(); 27 irq_work_entering_irq();
28 __smp_irq_work_interrupt(); 28 __smp_irq_work_interrupt();
29 exiting_irq(); 29 exiting_irq();
30} 30}
31 31
32void smp_trace_irq_work_interrupt(struct pt_regs *regs) 32__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
33{ 33{
34 irq_work_entering_irq(); 34 irq_work_entering_irq();
35 trace_irq_work_entry(IRQ_WORK_VECTOR); 35 trace_irq_work_entry(IRQ_WORK_VECTOR);
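The __visible annotations in irq.c and irq_work.c (and on several symbols later in this diff) mark functions and data that are reached only from assembly or from code the compiler cannot see, so link-time optimization does not internalize or drop them. On gcc the macro boils down to an attribute, roughly as sketched here (the exact definition lives in the compiler headers and is an assumption of this note):

/* Roughly what __visible expands to on gcc (assumption, see above). */
#define __visible __attribute__((externally_visible))

/* Called only from assembly; keep the symbol alive under -flto. */
__visible void called_from_asm_example(void)
{
}
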
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 2889b3d43882..460f5d9ceebb 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,7 +37,19 @@ static void __jump_label_transform(struct jump_entry *entry,
37 } else 37 } else
38 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); 38 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
39 39
40 (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); 40 /*
41 * Make text_poke_bp() a default fallback poker.
42 *
43 * At the time the change is being done, just ignore whether we
44 * are doing nop -> jump or jump -> nop transition, and assume
45 * always nop being the 'currently valid' instruction
46 *
47 */
48 if (poker)
49 (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
50 else
51 text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
52 (void *)entry->code + JUMP_LABEL_NOP_SIZE);
41} 53}
42 54
43void arch_jump_label_transform(struct jump_entry *entry, 55void arch_jump_label_transform(struct jump_entry *entry,
@@ -45,7 +57,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
45{ 57{
46 get_online_cpus(); 58 get_online_cpus();
47 mutex_lock(&text_mutex); 59 mutex_lock(&text_mutex);
48 __jump_label_transform(entry, type, text_poke_smp); 60 __jump_label_transform(entry, type, NULL);
49 mutex_unlock(&text_mutex); 61 mutex_unlock(&text_mutex);
50 put_online_cpus(); 62 put_online_cpus();
51} 63}
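__jump_label_transform() now falls back to text_poke_bp(), the int3-breakpoint based live-patching primitive, instead of the stop_machine-style text_poke_smp(); the optprobe changes further down make the same switch. The protocol, in outline: plant an int3 on the first byte and sync all cores, write the tail of the new instruction and sync, then restore the first byte and sync, with any CPU that hits the temporary int3 being resumed at the handler address passed as the last argument. A stand-alone outline of that ordering (the helper names are stand-ins, not kernel API):

#include <stddef.h>
#include <string.h>

typedef unsigned char u8;

/* Stubs standing in for the real primitives (text_poke() and a
 * sync_core() IPI to every CPU); illustrative only. */
static void write_text(u8 *addr, const u8 *src, size_t len) { memcpy(addr, src, len); }
static void sync_all_cores(void) { }

/* Outline of the protocol behind text_poke_bp(addr, opcode, len, handler). */
static void poke_int3_outline(u8 *addr, const u8 *opcode, size_t len)
{
	static const u8 int3 = 0xcc;

	write_text(addr, &int3, 1);	/* 1. trap anyone executing addr */
	sync_all_cores();

	if (len > 1) {
		write_text(addr + 1, opcode + 1, len - 1);	/* 2. new tail */
		sync_all_cores();
	}

	write_text(addr, opcode, 1);	/* 3. final first byte, patch done */
	sync_all_cores();
}
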
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index 2e9d4b5af036..c6ee63f927ab 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to); 82extern void synthesize_relcall(void *from, void *to);
83 83
84#ifdef CONFIG_OPTPROBES 84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); 85extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); 86extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */ 87#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) 88static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{ 89{
95 return 0; 90 return 0;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 211bce445522..79a3f9682871 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -661,7 +661,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
661/* 661/*
662 * Called from kretprobe_trampoline 662 * Called from kretprobe_trampoline
663 */ 663 */
664static __used __kprobes void *trampoline_handler(struct pt_regs *regs) 664__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
665{ 665{
666 struct kretprobe_instance *ri = NULL; 666 struct kretprobe_instance *ri = NULL;
667 struct hlist_head *head, empty_rp; 667 struct hlist_head *head, empty_rp;
@@ -1068,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1068 1068
1069int __init arch_init_kprobes(void) 1069int __init arch_init_kprobes(void)
1070{ 1070{
1071 return arch_init_optprobes(); 1071 return 0;
1072} 1072}
1073 1073
1074int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1074int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 76dc6f095724..898160b42e43 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -88,9 +88,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long v
88 *(unsigned long *)addr = val; 88 *(unsigned long *)addr = val;
89} 89}
90 90
91static void __used __kprobes kprobes_optinsn_template_holder(void) 91asm (
92{
93 asm volatile (
94 ".global optprobe_template_entry\n" 92 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n" 93 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64 94#ifdef CONFIG_X86_64
@@ -129,7 +127,6 @@ static void __used __kprobes kprobes_optinsn_template_holder(void)
129#endif 127#endif
130 ".global optprobe_template_end\n" 128 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n"); 129 "optprobe_template_end:\n");
132}
133 130
134#define TMPL_MOVE_IDX \ 131#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry) 132 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
@@ -371,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
371 return 0; 368 return 0;
372} 369}
373 370
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/* 371/*
400 * Replace breakpoints (int3) with relative jumps. 372 * Replace breakpoints (int3) with relative jumps.
401 * Caller must call with locking kprobe_mutex and text_mutex. 373 * Caller must call with locking kprobe_mutex and text_mutex.
@@ -403,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
403void __kprobes arch_optimize_kprobes(struct list_head *oplist) 375void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{ 376{
405 struct optimized_kprobe *op, *tmp; 377 struct optimized_kprobe *op, *tmp;
406 int c = 0; 378 u8 insn_buf[RELATIVEJUMP_SIZE];
407 379
408 list_for_each_entry_safe(op, tmp, oplist, list) { 380 list_for_each_entry_safe(op, tmp, oplist, list) {
381 s32 rel = (s32)((long)op->optinsn.insn -
382 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
383
409 WARN_ON(kprobe_disabled(&op->kp)); 384 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */ 385
411 setup_optimize_kprobe(&jump_poke_params[c], 386 /* Backup instructions which will be replaced by jump address */
412 jump_poke_bufs[c].buf, op); 387 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
388 RELATIVE_ADDR_SIZE);
389
390 insn_buf[0] = RELATIVEJUMP_OPCODE;
391 *(s32 *)(&insn_buf[1]) = rel;
392
393 text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
394 op->optinsn.insn);
395
413 list_del_init(&op->list); 396 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 } 397 }
417
418 /*
419 * text_poke_smp doesn't support NMI/MCE code modifying.
420 * However, since kprobes itself also doesn't support NMI/MCE
421 * code probing, it's not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424} 398}
425 399
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, 400/* Replace a relative jump with a breakpoint (int3). */
427 u8 *insn_buf, 401void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
428 struct optimized_kprobe *op)
429{ 402{
403 u8 insn_buf[RELATIVEJUMP_SIZE];
404
430 /* Set int3 to first byte for kprobes */ 405 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION; 406 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); 407 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433 408 text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
434 tprm->addr = op->kp.addr; 409 op->optinsn.insn);
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437} 410}
438 411
439/* 412/*
@@ -444,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list) 417 struct list_head *done_list)
445{ 418{
446 struct optimized_kprobe *op, *tmp; 419 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448 420
449 list_for_each_entry_safe(op, tmp, oplist, list) { 421 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */ 422 arch_unoptimize_kprobe(op);
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list); 423 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 } 424 }
457
458 /*
459 * text_poke_smp doesn't support NMI/MCE code modifying.
460 * However, since kprobes itself also doesn't support NMI/MCE
461 * code probing, it's not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475} 425}
476 426
477int __kprobes 427int __kprobes
@@ -491,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
491 } 441 }
492 return 0; 442 return 0;
493} 443}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a96d32cc55b8..697b93af02dd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,6 +34,7 @@
34#include <linux/sched.h> 34#include <linux/sched.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/kprobes.h> 36#include <linux/kprobes.h>
37#include <linux/debugfs.h>
37#include <asm/timer.h> 38#include <asm/timer.h>
38#include <asm/cpu.h> 39#include <asm/cpu.h>
39#include <asm/traps.h> 40#include <asm/traps.h>
@@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
419 WARN_ON(kvm_register_clock("primary cpu clock")); 420 WARN_ON(kvm_register_clock("primary cpu clock"));
420 kvm_guest_cpu_init(); 421 kvm_guest_cpu_init();
421 native_smp_prepare_boot_cpu(); 422 native_smp_prepare_boot_cpu();
423 kvm_spinlock_init();
422} 424}
423 425
424static void kvm_guest_cpu_online(void *dummy) 426static void kvm_guest_cpu_online(void *dummy)
@@ -498,11 +500,9 @@ void __init kvm_guest_init(void)
498#endif 500#endif
499} 501}
500 502
501static bool __init kvm_detect(void) 503static uint32_t __init kvm_detect(void)
502{ 504{
503 if (!kvm_para_available()) 505 return kvm_cpuid_base();
504 return false;
505 return true;
506} 506}
507 507
508const struct hypervisor_x86 x86_hyper_kvm __refconst = { 508const struct hypervisor_x86 x86_hyper_kvm __refconst = {
@@ -523,3 +523,263 @@ static __init int activate_jump_labels(void)
523 return 0; 523 return 0;
524} 524}
525arch_initcall(activate_jump_labels); 525arch_initcall(activate_jump_labels);
526
527#ifdef CONFIG_PARAVIRT_SPINLOCKS
528
529/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
530static void kvm_kick_cpu(int cpu)
531{
532 int apicid;
533 unsigned long flags = 0;
534
535 apicid = per_cpu(x86_cpu_to_apicid, cpu);
536 kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
537}
538
539enum kvm_contention_stat {
540 TAKEN_SLOW,
541 TAKEN_SLOW_PICKUP,
542 RELEASED_SLOW,
543 RELEASED_SLOW_KICKED,
544 NR_CONTENTION_STATS
545};
546
547#ifdef CONFIG_KVM_DEBUG_FS
548#define HISTO_BUCKETS 30
549
550static struct kvm_spinlock_stats
551{
552 u32 contention_stats[NR_CONTENTION_STATS];
553 u32 histo_spin_blocked[HISTO_BUCKETS+1];
554 u64 time_blocked;
555} spinlock_stats;
556
557static u8 zero_stats;
558
559static inline void check_zero(void)
560{
561 u8 ret;
562 u8 old;
563
564 old = ACCESS_ONCE(zero_stats);
565 if (unlikely(old)) {
566 ret = cmpxchg(&zero_stats, old, 0);
567 /* This ensures only one fellow resets the stat */
568 if (ret == old)
569 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
570 }
571}
572
573static inline void add_stats(enum kvm_contention_stat var, u32 val)
574{
575 check_zero();
576 spinlock_stats.contention_stats[var] += val;
577}
578
579
580static inline u64 spin_time_start(void)
581{
582 return sched_clock();
583}
584
585static void __spin_time_accum(u64 delta, u32 *array)
586{
587 unsigned index;
588
589 index = ilog2(delta);
590 check_zero();
591
592 if (index < HISTO_BUCKETS)
593 array[index]++;
594 else
595 array[HISTO_BUCKETS]++;
596}
597
598static inline void spin_time_accum_blocked(u64 start)
599{
600 u32 delta;
601
602 delta = sched_clock() - start;
603 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
604 spinlock_stats.time_blocked += delta;
605}
606
607static struct dentry *d_spin_debug;
608static struct dentry *d_kvm_debug;
609
610struct dentry *kvm_init_debugfs(void)
611{
612 d_kvm_debug = debugfs_create_dir("kvm", NULL);
613 if (!d_kvm_debug)
614 printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
615
616 return d_kvm_debug;
617}
618
619static int __init kvm_spinlock_debugfs(void)
620{
621 struct dentry *d_kvm;
622
623 d_kvm = kvm_init_debugfs();
624 if (d_kvm == NULL)
625 return -ENOMEM;
626
627 d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
628
629 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
630
631 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
632 &spinlock_stats.contention_stats[TAKEN_SLOW]);
633 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
634 &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
635
636 debugfs_create_u32("released_slow", 0444, d_spin_debug,
637 &spinlock_stats.contention_stats[RELEASED_SLOW]);
638 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
639 &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
640
641 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
642 &spinlock_stats.time_blocked);
643
644 debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
645 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
646
647 return 0;
648}
649fs_initcall(kvm_spinlock_debugfs);
650#else /* !CONFIG_KVM_DEBUG_FS */
651static inline void add_stats(enum kvm_contention_stat var, u32 val)
652{
653}
654
655static inline u64 spin_time_start(void)
656{
657 return 0;
658}
659
660static inline void spin_time_accum_blocked(u64 start)
661{
662}
663#endif /* CONFIG_KVM_DEBUG_FS */
664
665struct kvm_lock_waiting {
666 struct arch_spinlock *lock;
667 __ticket_t want;
668};
669
670/* cpus 'waiting' on a spinlock to become available */
671static cpumask_t waiting_cpus;
672
673/* Track spinlock on which a cpu is waiting */
674static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
675
676static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
677{
678 struct kvm_lock_waiting *w;
679 int cpu;
680 u64 start;
681 unsigned long flags;
682
683 if (in_nmi())
684 return;
685
686 w = &__get_cpu_var(klock_waiting);
687 cpu = smp_processor_id();
688 start = spin_time_start();
689
690 /*
691 * Make sure an interrupt handler can't upset things in a
692 * partially setup state.
693 */
694 local_irq_save(flags);
695
696 /*
697 * The ordering protocol on this is that the "lock" pointer
698 * may only be set non-NULL if the "want" ticket is correct.
699 * If we're updating "want", we must first clear "lock".
700 */
701 w->lock = NULL;
702 smp_wmb();
703 w->want = want;
704 smp_wmb();
705 w->lock = lock;
706
707 add_stats(TAKEN_SLOW, 1);
708
709 /*
710 * This uses set_bit, which is atomic but we should not rely on its
 711 * reordering guarantees. So a barrier is needed after this call.
712 */
713 cpumask_set_cpu(cpu, &waiting_cpus);
714
715 barrier();
716
717 /*
718 * Mark entry to slowpath before doing the pickup test to make
719 * sure we don't deadlock with an unlocker.
720 */
721 __ticket_enter_slowpath(lock);
722
723 /*
724 * check again make sure it didn't become free while
725 * we weren't looking.
726 */
727 if (ACCESS_ONCE(lock->tickets.head) == want) {
728 add_stats(TAKEN_SLOW_PICKUP, 1);
729 goto out;
730 }
731
732 /*
 733 * Halt until it is our turn and we get kicked. Note that we do a safe
 734 * halt for the irq-enabled case, to avoid hanging when the lock info is
 735 * overwritten in an irq spinlock slowpath and no spurious interrupt occurs to save us.
736 */
737 if (arch_irqs_disabled_flags(flags))
738 halt();
739 else
740 safe_halt();
741
742out:
743 cpumask_clear_cpu(cpu, &waiting_cpus);
744 w->lock = NULL;
745 local_irq_restore(flags);
746 spin_time_accum_blocked(start);
747}
748PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
749
750/* Kick vcpu waiting on @lock->head to reach value @ticket */
751static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
752{
753 int cpu;
754
755 add_stats(RELEASED_SLOW, 1);
756 for_each_cpu(cpu, &waiting_cpus) {
757 const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
758 if (ACCESS_ONCE(w->lock) == lock &&
759 ACCESS_ONCE(w->want) == ticket) {
760 add_stats(RELEASED_SLOW_KICKED, 1);
761 kvm_kick_cpu(cpu);
762 break;
763 }
764 }
765}
766
767/*
768 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
769 */
770void __init kvm_spinlock_init(void)
771{
772 if (!kvm_para_available())
773 return;
774 /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
775 if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
776 return;
777
778 printk(KERN_INFO "KVM setup paravirtual spinlock\n");
779
780 static_key_slow_inc(&paravirt_ticketlocks_enabled);
781
782 pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
783 pv_lock_ops.unlock_kick = kvm_unlock_kick;
784}
785#endif /* CONFIG_PARAVIRT_SPINLOCKS */
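These are the guest-side halves of paravirtual ticket spinlocks: kvm_lock_spinning() records which ticket this vCPU wants, marks the lock's slowpath flag and halts, while kvm_unlock_kick() finds the vCPU waiting for the released ticket and wakes it with the KVM_HC_KICK_CPU hypercall. A compact user-space model of where such hooks plug into a ticket lock is sketched below (SPIN_THRESHOLD, the field layout and the yield-based stand-ins are illustrative, not the kernel's exact code):

#include <stdatomic.h>
#include <sched.h>

#define SPIN_THRESHOLD (1 << 15)	/* spins before taking the slowpath */

struct ticketlock {
	atomic_uint head;	/* ticket currently being served */
	atomic_uint tail;	/* next ticket to hand out */
};

static void lock_spinning(struct ticketlock *lk, unsigned want)
{
	(void)lk; (void)want;
	sched_yield();		/* the KVM hook halts the vCPU instead */
}

static void unlock_kick(struct ticketlock *lk, unsigned next)
{
	(void)lk; (void)next;	/* the KVM hook issues KVM_HC_KICK_CPU */
}

static void ticket_lock(struct ticketlock *lk)
{
	unsigned me = atomic_fetch_add(&lk->tail, 1);

	for (;;) {
		for (int spins = 0; spins < SPIN_THRESHOLD; spins++)
			if (atomic_load(&lk->head) == me)
				return;
		/* Out of patience: ask the hypervisor to put us to sleep
		 * until the holder's unlock kicks this ticket. */
		lock_spinning(lk, me);
	}
}

static void ticket_unlock(struct ticketlock *lk)
{
	unsigned next = atomic_fetch_add(&lk->head, 1) + 1;

	unlock_kick(lk, next);
}
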
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 47ebb1dbfbcb..7123b5df479d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -145,10 +145,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
145 return 0; 145 return 0;
146} 146}
147 147
148static unsigned int verify_patch_size(int cpu, u32 patch_size, 148static unsigned int verify_patch_size(u8 family, u32 patch_size,
149 unsigned int size) 149 unsigned int size)
150{ 150{
151 struct cpuinfo_x86 *c = &cpu_data(cpu);
152 u32 max_size; 151 u32 max_size;
153 152
154#define F1XH_MPB_MAX_SIZE 2048 153#define F1XH_MPB_MAX_SIZE 2048
@@ -156,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size,
156#define F15H_MPB_MAX_SIZE 4096 155#define F15H_MPB_MAX_SIZE 4096
157#define F16H_MPB_MAX_SIZE 3458 156#define F16H_MPB_MAX_SIZE 3458
158 157
159 switch (c->x86) { 158 switch (family) {
160 case 0x14: 159 case 0x14:
161 max_size = F14H_MPB_MAX_SIZE; 160 max_size = F14H_MPB_MAX_SIZE;
162 break; 161 break;
@@ -220,12 +219,13 @@ int apply_microcode_amd(int cpu)
220 return 0; 219 return 0;
221 } 220 }
222 221
223 if (__apply_microcode_amd(mc_amd)) 222 if (__apply_microcode_amd(mc_amd)) {
224 pr_err("CPU%d: update failed for patch_level=0x%08x\n", 223 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
225 cpu, mc_amd->hdr.patch_id); 224 cpu, mc_amd->hdr.patch_id);
226 else 225 return -1;
227 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, 226 }
228 mc_amd->hdr.patch_id); 227 pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
228 mc_amd->hdr.patch_id);
229 229
230 uci->cpu_sig.rev = mc_amd->hdr.patch_id; 230 uci->cpu_sig.rev = mc_amd->hdr.patch_id;
231 c->microcode = mc_amd->hdr.patch_id; 231 c->microcode = mc_amd->hdr.patch_id;
@@ -276,9 +276,8 @@ static void cleanup(void)
276 * driver cannot continue functioning normally. In such cases, we tear 276 * driver cannot continue functioning normally. In such cases, we tear
277 * down everything we've used up so far and exit. 277 * down everything we've used up so far and exit.
278 */ 278 */
279static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) 279static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
280{ 280{
281 struct cpuinfo_x86 *c = &cpu_data(cpu);
282 struct microcode_header_amd *mc_hdr; 281 struct microcode_header_amd *mc_hdr;
283 struct ucode_patch *patch; 282 struct ucode_patch *patch;
284 unsigned int patch_size, crnt_size, ret; 283 unsigned int patch_size, crnt_size, ret;
@@ -298,7 +297,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
298 297
299 /* check if patch is for the current family */ 298 /* check if patch is for the current family */
300 proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); 299 proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
301 if (proc_fam != c->x86) 300 if (proc_fam != family)
302 return crnt_size; 301 return crnt_size;
303 302
304 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { 303 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
@@ -307,7 +306,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
307 return crnt_size; 306 return crnt_size;
308 } 307 }
309 308
310 ret = verify_patch_size(cpu, patch_size, leftover); 309 ret = verify_patch_size(family, patch_size, leftover);
311 if (!ret) { 310 if (!ret) {
312 pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); 311 pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
313 return crnt_size; 312 return crnt_size;
@@ -338,7 +337,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
338 return crnt_size; 337 return crnt_size;
339} 338}
340 339
341static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t size) 340static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
341 size_t size)
342{ 342{
343 enum ucode_state ret = UCODE_ERROR; 343 enum ucode_state ret = UCODE_ERROR;
344 unsigned int leftover; 344 unsigned int leftover;
@@ -361,7 +361,7 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz
361 } 361 }
362 362
363 while (leftover) { 363 while (leftover) {
364 crnt_size = verify_and_add_patch(cpu, fw, leftover); 364 crnt_size = verify_and_add_patch(family, fw, leftover);
365 if (crnt_size < 0) 365 if (crnt_size < 0)
366 return ret; 366 return ret;
367 367
@@ -372,22 +372,22 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz
372 return UCODE_OK; 372 return UCODE_OK;
373} 373}
374 374
375enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) 375enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
376{ 376{
377 enum ucode_state ret; 377 enum ucode_state ret;
378 378
379 /* free old equiv table */ 379 /* free old equiv table */
380 free_equiv_cpu_table(); 380 free_equiv_cpu_table();
381 381
382 ret = __load_microcode_amd(cpu, data, size); 382 ret = __load_microcode_amd(family, data, size);
383 383
384 if (ret != UCODE_OK) 384 if (ret != UCODE_OK)
385 cleanup(); 385 cleanup();
386 386
387#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) 387#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
388 /* save BSP's matching patch for early load */ 388 /* save BSP's matching patch for early load */
389 if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { 389 if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
390 struct ucode_patch *p = find_patch(cpu); 390 struct ucode_patch *p = find_patch(smp_processor_id());
391 if (p) { 391 if (p) {
392 memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); 392 memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
393 memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), 393 memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
@@ -440,7 +440,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
440 goto fw_release; 440 goto fw_release;
441 } 441 }
442 442
443 ret = load_microcode_amd(cpu, fw->data, fw->size); 443 ret = load_microcode_amd(c->x86, fw->data, fw->size);
444 444
445 fw_release: 445 fw_release:
446 release_firmware(fw); 446 release_firmware(fw);
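Keying load_microcode_amd() and verify_patch_size() on the CPU family instead of a cpu number lets the early loader (next file) call them before cpu_data() is populated; both callers derive the family from the CPUID 0x00000001 signature as base family plus extended family. A tiny stand-alone version of that computation:

#include <assert.h>

/* Family = base family (bits 11:8) + extended family (bits 27:20),
 * as computed from eax = cpuid_eax(0x00000001) in this patch. */
static unsigned char x86_family(unsigned int sig)
{
	return ((sig >> 8) & 0xf) + ((sig >> 20) & 0xff);
}

int main(void)
{
	/* Example signature for family 0x15 (0xf base + 0x06 extended). */
	assert(x86_family(0x00600f12) == 0x15);
	return 0;
}
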
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c
index 1d14ffee5749..6073104ccaa3 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/microcode_amd_early.c
@@ -238,25 +238,17 @@ static void __init collect_cpu_sig_on_bsp(void *arg)
238 uci->cpu_sig.sig = cpuid_eax(0x00000001); 238 uci->cpu_sig.sig = cpuid_eax(0x00000001);
239} 239}
240#else 240#else
241static void collect_cpu_info_amd_early(struct cpuinfo_x86 *c, 241void load_ucode_amd_ap(void)
242 struct ucode_cpu_info *uci)
243{ 242{
243 unsigned int cpu = smp_processor_id();
244 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
244 u32 rev, eax; 245 u32 rev, eax;
245 246
246 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); 247 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
247 eax = cpuid_eax(0x00000001); 248 eax = cpuid_eax(0x00000001);
248 249
249 uci->cpu_sig.sig = eax;
250 uci->cpu_sig.rev = rev; 250 uci->cpu_sig.rev = rev;
251 c->microcode = rev; 251 uci->cpu_sig.sig = eax;
252 c->x86 = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
253}
254
255void load_ucode_amd_ap(void)
256{
257 unsigned int cpu = smp_processor_id();
258
259 collect_cpu_info_amd_early(&cpu_data(cpu), ucode_cpu_info + cpu);
260 252
261 if (cpu && !ucode_loaded) { 253 if (cpu && !ucode_loaded) {
262 void *ucode; 254 void *ucode;
@@ -265,8 +257,10 @@ void load_ucode_amd_ap(void)
265 return; 257 return;
266 258
267 ucode = (void *)(initrd_start + ucode_offset); 259 ucode = (void *)(initrd_start + ucode_offset);
268 if (load_microcode_amd(0, ucode, ucode_size) != UCODE_OK) 260 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
261 if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
269 return; 262 return;
263
270 ucode_loaded = true; 264 ucode_loaded = true;
271 } 265 }
272 266
@@ -278,6 +272,8 @@ int __init save_microcode_in_initrd_amd(void)
278{ 272{
279 enum ucode_state ret; 273 enum ucode_state ret;
280 void *ucode; 274 void *ucode;
275 u32 eax;
276
281#ifdef CONFIG_X86_32 277#ifdef CONFIG_X86_32
282 unsigned int bsp = boot_cpu_data.cpu_index; 278 unsigned int bsp = boot_cpu_data.cpu_index;
283 struct ucode_cpu_info *uci = ucode_cpu_info + bsp; 279 struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
@@ -293,7 +289,10 @@ int __init save_microcode_in_initrd_amd(void)
293 return 0; 289 return 0;
294 290
295 ucode = (void *)(initrd_start + ucode_offset); 291 ucode = (void *)(initrd_start + ucode_offset);
296 ret = load_microcode_amd(0, ucode, ucode_size); 292 eax = cpuid_eax(0x00000001);
293 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
294
295 ret = load_microcode_amd(eax, ucode, ucode_size);
297 if (ret != UCODE_OK) 296 if (ret != UCODE_OK)
298 return -EINVAL; 297 return -EINVAL;
299 298
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 676b8c77a976..bbb6c7316341 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -4,25 +4,17 @@
4 */ 4 */
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/jump_label.h>
7 8
8#include <asm/paravirt.h> 9#include <asm/paravirt.h>
9 10
10static inline void
11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{
13 arch_spin_lock(lock);
14}
15
16struct pv_lock_ops pv_lock_ops = { 11struct pv_lock_ops pv_lock_ops = {
17#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
18 .spin_is_locked = __ticket_spin_is_locked, 13 .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
19 .spin_is_contended = __ticket_spin_is_contended, 14 .unlock_kick = paravirt_nop,
20
21 .spin_lock = __ticket_spin_lock,
22 .spin_lock_flags = default_spin_lock_flags,
23 .spin_trylock = __ticket_spin_trylock,
24 .spin_unlock = __ticket_spin_unlock,
25#endif 15#endif
26}; 16};
27EXPORT_SYMBOL(pv_lock_ops); 17EXPORT_SYMBOL(pv_lock_ops);
28 18
19struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
20EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
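pv_lock_ops is reduced to the two slowpath hooks, both defaulting to paravirt_nop, and the new paravirt_ticketlocks_enabled static key keeps the ticket-lock fast path free of paravirt overhead until a backend such as the KVM code above enables it with static_key_slow_inc(). A kernel-context sketch of that gating pattern (the two branch targets are hypothetical placeholders):

#include <linux/jump_label.h>

extern void take_pv_slowpath(void);	/* hypothetical */
extern void plain_fastpath(void);	/* hypothetical */

struct static_key pv_ticketlocks = STATIC_KEY_INIT_FALSE;

static void unlock_path(void)
{
	/* static_key_false() compiles to a patched nop/jmp, so the pv
	 * branch costs nothing until static_key_slow_inc() flips it. */
	if (static_key_false(&pv_ticketlocks))
		take_pv_slowpath();
	else
		plain_fastpath();
}
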
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index cd6de64cc480..1b10af835c31 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -62,11 +62,6 @@ void __init default_banner(void)
62 pv_info.name); 62 pv_info.name);
63} 63}
64 64
65/* Simple instruction patching code. */
66#define DEF_NATIVE(ops, name, code) \
67 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
68 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
69
70/* Undefined instruction for dealing with missing ops pointers. */ 65/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 66static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 67
@@ -324,7 +319,7 @@ struct pv_time_ops pv_time_ops = {
324 .steal_clock = native_steal_clock, 319 .steal_clock = native_steal_clock,
325}; 320};
326 321
327struct pv_irq_ops pv_irq_ops = { 322__visible struct pv_irq_ops pv_irq_ops = {
328 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), 323 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
329 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), 324 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
330 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), 325 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -336,7 +331,7 @@ struct pv_irq_ops pv_irq_ops = {
336#endif 331#endif
337}; 332};
338 333
339struct pv_cpu_ops pv_cpu_ops = { 334__visible struct pv_cpu_ops pv_cpu_ops = {
340 .cpuid = native_cpuid, 335 .cpuid = native_cpuid,
341 .get_debugreg = native_get_debugreg, 336 .get_debugreg = native_get_debugreg,
342 .set_debugreg = native_set_debugreg, 337 .set_debugreg = native_set_debugreg,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 83369e5a1d27..c83516be1052 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -36,7 +36,7 @@
36 * section. Since TSS's are completely CPU-local, we want them 36 * section. Since TSS's are completely CPU-local, we want them
37 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 37 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
38 */ 38 */
39DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; 39__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
40 40
41#ifdef CONFIG_X86_64 41#ifdef CONFIG_X86_64
42static DEFINE_PER_CPU(unsigned char, is_idle); 42static DEFINE_PER_CPU(unsigned char, is_idle);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8adefca71dc..884f98f69354 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(start_thread);
247 * the task-switch, and shows up in ret_from_fork in entry.S, 247 * the task-switch, and shows up in ret_from_fork in entry.S,
248 * for example. 248 * for example.
249 */ 249 */
250__notrace_funcgraph struct task_struct * 250__visible __notrace_funcgraph struct task_struct *
251__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 251__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
252{ 252{
253 struct thread_struct *prev = &prev_p->thread, 253 struct thread_struct *prev = &prev_p->thread,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 05646bab4ca6..bb1dc51bab05 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52 52
53asmlinkage extern void ret_from_fork(void); 53asmlinkage extern void ret_from_fork(void);
54 54
55DEFINE_PER_CPU(unsigned long, old_rsp); 55asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp);
56 56
57/* Prints also some state that isn't saved in the pt_regs */ 57/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 58void __show_regs(struct pt_regs *regs, int all)
@@ -274,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
274 * Kprobes not supported here. Set the probe on schedule instead. 274 * Kprobes not supported here. Set the probe on schedule instead.
275 * Function graph tracer not supported too. 275 * Function graph tracer not supported too.
276 */ 276 */
277__notrace_funcgraph struct task_struct * 277__visible __notrace_funcgraph struct task_struct *
278__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 278__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
279{ 279{
280 struct thread_struct *prev = &prev_p->thread; 280 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 129}
130 130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64 131#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/* 132/*
172 * Initialize the generic pvclock vsyscall state. This will allocate 133 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a 134 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
181 142
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 143 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183 144
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 145 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 146 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa(i) + (idx*PAGE_SIZE), 147 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 148 PAGE_KERNEL_VVAR);
190 } 149 }
191 150
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0; 151 return 0;
196} 152}
197#endif 153#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f8ec57815c05..f0de6294b955 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -206,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data);
206 206
207 207
208#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) 208#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
209unsigned long mmu_cr4_features; 209__visible unsigned long mmu_cr4_features;
210#else 210#else
211unsigned long mmu_cr4_features = X86_CR4_PAE; 211__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
212#endif 212#endif
213 213
214/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ 214/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -426,25 +426,23 @@ static void __init reserve_initrd(void)
426static void __init parse_setup_data(void) 426static void __init parse_setup_data(void)
427{ 427{
428 struct setup_data *data; 428 struct setup_data *data;
429 u64 pa_data; 429 u64 pa_data, pa_next;
430 430
431 pa_data = boot_params.hdr.setup_data; 431 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 432 while (pa_data) {
433 u32 data_len, map_len; 433 u32 data_len, map_len, data_type;
434 434
435 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), 435 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
436 (u64)sizeof(struct setup_data)); 436 (u64)sizeof(struct setup_data));
437 data = early_memremap(pa_data, map_len); 437 data = early_memremap(pa_data, map_len);
438 data_len = data->len + sizeof(struct setup_data); 438 data_len = data->len + sizeof(struct setup_data);
439 if (data_len > map_len) { 439 data_type = data->type;
440 early_iounmap(data, map_len); 440 pa_next = data->next;
441 data = early_memremap(pa_data, data_len); 441 early_iounmap(data, map_len);
442 map_len = data_len;
443 }
444 442
445 switch (data->type) { 443 switch (data_type) {
446 case SETUP_E820_EXT: 444 case SETUP_E820_EXT:
447 parse_e820_ext(data); 445 parse_e820_ext(pa_data, data_len);
448 break; 446 break;
449 case SETUP_DTB: 447 case SETUP_DTB:
450 add_dtb(pa_data); 448 add_dtb(pa_data);
@@ -452,8 +450,7 @@ static void __init parse_setup_data(void)
452 default: 450 default:
453 break; 451 break;
454 } 452 }
455 pa_data = data->next; 453 pa_data = pa_next;
456 early_iounmap(data, map_len);
457 } 454 }
458} 455}
459 456
@@ -1070,7 +1067,7 @@ void __init setup_arch(char **cmdline_p)
1070 1067
1071 cleanup_highmap(); 1068 cleanup_highmap();
1072 1069
1073 memblock.current_limit = ISA_END_ADDRESS; 1070 memblock_set_current_limit(ISA_END_ADDRESS);
1074 memblock_x86_fill(); 1071 memblock_x86_fill();
1075 1072
1076 /* 1073 /*
@@ -1103,7 +1100,7 @@ void __init setup_arch(char **cmdline_p)
1103 1100
1104 setup_real_mode(); 1101 setup_real_mode();
1105 1102
1106 memblock.current_limit = get_max_mapped(); 1103 memblock_set_current_limit(get_max_mapped());
1107 dma_contiguous_reserve(0); 1104 dma_contiguous_reserve(0);
1108 1105
1109 /* 1106 /*
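
Note on the parse_setup_data() hunk above: type, len and next are now copied out of the early mapping and the mapping is dropped before dispatching, so handlers such as parse_e820_ext() remap whatever range they need themselves. A minimal illustrative sketch of that walk pattern (function name and pr_debug output are made up):

#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/bootparam.h>
#include <asm/io.h>

/* illustrative walker only: copy the header fields, unmap, then dispatch */
static void __init example_walk_setup_data(u64 pa_data)
{
	while (pa_data) {
		struct setup_data *data;
		u32 type, len;
		u64 next;

		data = early_memremap(pa_data, sizeof(*data));
		type = data->type;
		len  = data->len + sizeof(*data);
		next = data->next;
		early_iounmap(data, sizeof(*data));

		pr_debug("setup_data @0x%llx type=%u len=%u\n", pa_data, type, len);
		pa_data = next;		/* safe: copied before unmapping */
	}
}
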
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cf913587d4dd..9e5de6813e1f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -358,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
358 else 358 else
359 put_user_ex(0, &frame->uc.uc_flags); 359 put_user_ex(0, &frame->uc.uc_flags);
360 put_user_ex(0, &frame->uc.uc_link); 360 put_user_ex(0, &frame->uc.uc_link);
361 err |= __save_altstack(&frame->uc.uc_stack, regs->sp); 361 save_altstack_ex(&frame->uc.uc_stack, regs->sp);
362 362
363 /* Set up to return from userspace. */ 363 /* Set up to return from userspace. */
364 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 364 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -423,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
423 else 423 else
424 put_user_ex(0, &frame->uc.uc_flags); 424 put_user_ex(0, &frame->uc.uc_flags);
425 put_user_ex(0, &frame->uc.uc_link); 425 put_user_ex(0, &frame->uc.uc_link);
426 err |= __save_altstack(&frame->uc.uc_stack, regs->sp); 426 save_altstack_ex(&frame->uc.uc_stack, regs->sp);
427 427
428 /* Set up to return from userspace. If provided, use a stub 428 /* Set up to return from userspace. If provided, use a stub
429 already in userspace. */ 429 already in userspace. */
@@ -490,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
490 else 490 else
491 put_user_ex(0, &frame->uc.uc_flags); 491 put_user_ex(0, &frame->uc.uc_flags);
492 put_user_ex(0, &frame->uc.uc_link); 492 put_user_ex(0, &frame->uc.uc_link);
493 err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); 493 compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
494 put_user_ex(0, &frame->uc.uc__pad0); 494 put_user_ex(0, &frame->uc.uc__pad0);
495 495
496 if (ksig->ka.sa.sa_flags & SA_RESTORER) { 496 if (ksig->ka.sa.sa_flags & SA_RESTORER) {
@@ -533,7 +533,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
533 * Do a signal return; undo the signal stack. 533 * Do a signal return; undo the signal stack.
534 */ 534 */
535#ifdef CONFIG_X86_32 535#ifdef CONFIG_X86_32
536unsigned long sys_sigreturn(void) 536asmlinkage unsigned long sys_sigreturn(void)
537{ 537{
538 struct pt_regs *regs = current_pt_regs(); 538 struct pt_regs *regs = current_pt_regs();
539 struct sigframe __user *frame; 539 struct sigframe __user *frame;
@@ -562,7 +562,7 @@ badframe:
562} 562}
563#endif /* CONFIG_X86_32 */ 563#endif /* CONFIG_X86_32 */
564 564
565long sys_rt_sigreturn(void) 565asmlinkage long sys_rt_sigreturn(void)
566{ 566{
567 struct pt_regs *regs = current_pt_regs(); 567 struct pt_regs *regs = current_pt_regs();
568 struct rt_sigframe __user *frame; 568 struct rt_sigframe __user *frame;
@@ -728,7 +728,7 @@ static void do_signal(struct pt_regs *regs)
728 * notification of userspace execution resumption 728 * notification of userspace execution resumption
729 * - triggered by the TIF_WORK_MASK flags 729 * - triggered by the TIF_WORK_MASK flags
730 */ 730 */
731void 731__visible void
732do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 732do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
733{ 733{
734 user_exit(); 734 user_exit();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index cdaa347dfcad..7c3a5a61f2e4 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -256,7 +256,7 @@ static inline void __smp_reschedule_interrupt(void)
256 scheduler_ipi(); 256 scheduler_ipi();
257} 257}
258 258
259void smp_reschedule_interrupt(struct pt_regs *regs) 259__visible void smp_reschedule_interrupt(struct pt_regs *regs)
260{ 260{
261 ack_APIC_irq(); 261 ack_APIC_irq();
262 __smp_reschedule_interrupt(); 262 __smp_reschedule_interrupt();
@@ -271,7 +271,7 @@ static inline void smp_entering_irq(void)
271 irq_enter(); 271 irq_enter();
272} 272}
273 273
274void smp_trace_reschedule_interrupt(struct pt_regs *regs) 274__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
275{ 275{
276 /* 276 /*
277 * Need to call irq_enter() before calling the trace point. 277 * Need to call irq_enter() before calling the trace point.
@@ -295,14 +295,14 @@ static inline void __smp_call_function_interrupt(void)
295 inc_irq_stat(irq_call_count); 295 inc_irq_stat(irq_call_count);
296} 296}
297 297
298void smp_call_function_interrupt(struct pt_regs *regs) 298__visible void smp_call_function_interrupt(struct pt_regs *regs)
299{ 299{
300 smp_entering_irq(); 300 smp_entering_irq();
301 __smp_call_function_interrupt(); 301 __smp_call_function_interrupt();
302 exiting_irq(); 302 exiting_irq();
303} 303}
304 304
305void smp_trace_call_function_interrupt(struct pt_regs *regs) 305__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
306{ 306{
307 smp_entering_irq(); 307 smp_entering_irq();
308 trace_call_function_entry(CALL_FUNCTION_VECTOR); 308 trace_call_function_entry(CALL_FUNCTION_VECTOR);
@@ -317,14 +317,14 @@ static inline void __smp_call_function_single_interrupt(void)
317 inc_irq_stat(irq_call_count); 317 inc_irq_stat(irq_call_count);
318} 318}
319 319
320void smp_call_function_single_interrupt(struct pt_regs *regs) 320__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
321{ 321{
322 smp_entering_irq(); 322 smp_entering_irq();
323 __smp_call_function_single_interrupt(); 323 __smp_call_function_single_interrupt();
324 exiting_irq(); 324 exiting_irq();
325} 325}
326 326
327void smp_trace_call_function_single_interrupt(struct pt_regs *regs) 327__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
328{ 328{
329 smp_entering_irq(); 329 smp_entering_irq();
330 trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); 330 trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index dbded5aedb81..30277e27431a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
101 *begin = new_begin; 101 *begin = new_begin;
102 } 102 }
103 } else { 103 } else {
104 *begin = TASK_UNMAPPED_BASE; 104 *begin = current->mm->mmap_legacy_base;
105 *end = TASK_SIZE; 105 *end = TASK_SIZE;
106 } 106 }
107} 107}
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index 147fcd4941c4..e9bcd57d8a9e 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void);
15 15
16extern asmlinkage void sys_ni_syscall(void); 16extern asmlinkage void sys_ni_syscall(void);
17 17
18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 18__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
19 /* 19 /*
20 * Smells like a compiler bug -- it doesn't work 20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed. 21 * when the & below is removed.
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 5c7f8c20da74..4ac730b37f0b 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -4,6 +4,7 @@
4#include <linux/sys.h> 4#include <linux/sys.h>
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h>
7 8
8#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 9#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
9 10
@@ -19,11 +20,9 @@
19 20
20#define __SYSCALL_64(nr, sym, compat) [nr] = sym, 21#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
21 22
22typedef void (*sys_call_ptr_t)(void);
23
24extern void sys_ni_syscall(void); 23extern void sys_ni_syscall(void);
25 24
26const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 25asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
27 /* 26 /*
28 * Smells like a compiler bug -- it doesn't work 27 * Smells like a compiler bug -- it doesn't work
29 * when the & below is removed. 28 * when the & below is removed.
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
new file mode 100644
index 000000000000..193ec2ce46c7
--- /dev/null
+++ b/arch/x86/kernel/sysfb.c
@@ -0,0 +1,74 @@
1/*
2 * Generic System Framebuffers on x86
3 * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 */
10
11/*
12 * Simple-Framebuffer support for x86 systems
13 * Create a platform-device for any available boot framebuffer. The
14 * simple-framebuffer platform device is already available on DT systems, so
15 * this module parses the global "screen_info" object and creates a suitable
16 * platform device compatible with the "simple-framebuffer" DT object. If
17 * the framebuffer is incompatible, we instead create a legacy
18 * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
19 * pass the screen_info as platform_data. This allows legacy drivers
20 * to pick these devices up without messing with simple-framebuffer drivers.
21 * The global "screen_info" is still valid at all times.
22 *
23 * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
24 * platform devices, but only use legacy framebuffer devices for
25 * backwards compatibility.
26 *
27 * TODO: We set the dev_id field of all platform-devices to 0. This allows
28 * other x86 OF/DT parsers to create such devices, too. However, they must
29 * start at offset 1 for this to work.
30 */
31
32#include <linux/err.h>
33#include <linux/init.h>
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/platform_data/simplefb.h>
37#include <linux/platform_device.h>
38#include <linux/screen_info.h>
39#include <asm/sysfb.h>
40
41static __init int sysfb_init(void)
42{
43 struct screen_info *si = &screen_info;
44 struct simplefb_platform_data mode;
45 struct platform_device *pd;
46 const char *name;
47 bool compatible;
48 int ret;
49
50 sysfb_apply_efi_quirks();
51
52 /* try to create a simple-framebuffer device */
53 compatible = parse_mode(si, &mode);
54 if (compatible) {
55 ret = create_simplefb(si, &mode);
56 if (!ret)
57 return 0;
58 }
59
60 /* if the FB is incompatible, create a legacy framebuffer device */
61 if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
62 name = "efi-framebuffer";
63 else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
64 name = "vesa-framebuffer";
65 else
66 name = "platform-framebuffer";
67
68 pd = platform_device_register_resndata(NULL, name, 0,
69 NULL, 0, si, sizeof(*si));
70 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
71}
72
73/* must execute after PCI subsystem for EFI quirks */
74device_initcall(sysfb_init);
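
Note on sysfb.c above: sysfb_init() registers either a "simple-framebuffer" device carrying a struct simplefb_platform_data, or a legacy "efi-framebuffer"/"vesa-framebuffer"/"platform-framebuffer" device whose platform_data is the whole screen_info. As a hedged illustration (the driver name and probe body are made up), a legacy consumer would bind by device name and read that platform_data roughly like this:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/screen_info.h>

static int example_efifb_probe(struct platform_device *pdev)
{
	struct screen_info *si = dev_get_platdata(&pdev->dev);

	if (!si)
		return -ENODEV;

	dev_info(&pdev->dev, "boot fb at 0x%x, %ux%u\n",
		 si->lfb_base, si->lfb_width, si->lfb_height);
	return 0;
}

static struct platform_driver example_efifb_driver = {
	.probe	= example_efifb_probe,
	.driver	= { .name = "efi-framebuffer" },
};
module_platform_driver(example_efifb_driver);

MODULE_LICENSE("GPL");
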
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
new file mode 100644
index 000000000000..b285d4e8c68e
--- /dev/null
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -0,0 +1,214 @@
1/*
2 * Generic System Framebuffers on x86
3 * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
4 *
5 * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the License, or (at your option)
10 * any later version.
11 */
12
13/*
14 * EFI Quirks
15 * Several EFI systems do not correctly advertise their boot framebuffers.
16 * Hence, we use this static table of known broken machines and fix up the
 17 * information so framebuffer drivers can load correctly.
18 */
19
20#include <linux/dmi.h>
21#include <linux/err.h>
22#include <linux/init.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/pci.h>
26#include <linux/screen_info.h>
27#include <video/vga.h>
28#include <asm/sysfb.h>
29
30enum {
31 OVERRIDE_NONE = 0x0,
32 OVERRIDE_BASE = 0x1,
33 OVERRIDE_STRIDE = 0x2,
34 OVERRIDE_HEIGHT = 0x4,
35 OVERRIDE_WIDTH = 0x8,
36};
37
38struct efifb_dmi_info efifb_dmi_list[] = {
39 [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
40 [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
41 [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
42 [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
43 [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
44 [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
45 [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
46 [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
47 [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
48 [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
49 [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
50 [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
51 [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
52 [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
53 [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
54 /* 11" Macbook Air 3,1 passes the wrong stride */
55 [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
56 [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
57 [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
58 [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
59 [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
60 [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
61 [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
62 [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
63 [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
64 [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
65 [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
66 [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
67 [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
68 [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
69};
70
71#define choose_value(dmivalue, fwvalue, field, flags) ({ \
72 typeof(fwvalue) _ret_ = fwvalue; \
73 if ((flags) & (field)) \
74 _ret_ = dmivalue; \
75 else if ((fwvalue) == 0) \
76 _ret_ = dmivalue; \
77 _ret_; \
78 })
79
80static int __init efifb_set_system(const struct dmi_system_id *id)
81{
82 struct efifb_dmi_info *info = id->driver_data;
83
84 if (info->base == 0 && info->height == 0 && info->width == 0 &&
85 info->stride == 0)
86 return 0;
87
88 /* Trust the bootloader over the DMI tables */
89 if (screen_info.lfb_base == 0) {
90#if defined(CONFIG_PCI)
91 struct pci_dev *dev = NULL;
92 int found_bar = 0;
93#endif
94 if (info->base) {
95 screen_info.lfb_base = choose_value(info->base,
96 screen_info.lfb_base, OVERRIDE_BASE,
97 info->flags);
98
99#if defined(CONFIG_PCI)
100 /* make sure that the address in the table is actually
101 * on a VGA device's PCI BAR */
102
103 for_each_pci_dev(dev) {
104 int i;
105 if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
106 continue;
107 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
108 resource_size_t start, end;
109
110 start = pci_resource_start(dev, i);
111 if (start == 0)
112 break;
113 end = pci_resource_end(dev, i);
114 if (screen_info.lfb_base >= start &&
115 screen_info.lfb_base < end) {
116 found_bar = 1;
117 }
118 }
119 }
120 if (!found_bar)
121 screen_info.lfb_base = 0;
122#endif
123 }
124 }
125 if (screen_info.lfb_base) {
126 screen_info.lfb_linelength = choose_value(info->stride,
127 screen_info.lfb_linelength, OVERRIDE_STRIDE,
128 info->flags);
129 screen_info.lfb_width = choose_value(info->width,
130 screen_info.lfb_width, OVERRIDE_WIDTH,
131 info->flags);
132 screen_info.lfb_height = choose_value(info->height,
133 screen_info.lfb_height, OVERRIDE_HEIGHT,
134 info->flags);
135 if (screen_info.orig_video_isVGA == 0)
136 screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
137 } else {
138 screen_info.lfb_linelength = 0;
139 screen_info.lfb_width = 0;
140 screen_info.lfb_height = 0;
141 screen_info.orig_video_isVGA = 0;
142 return 0;
143 }
144
145 printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
146 "(%dx%d, stride %d)\n", id->ident,
147 screen_info.lfb_base, screen_info.lfb_width,
148 screen_info.lfb_height, screen_info.lfb_linelength);
149
150 return 1;
151}
152
153#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
154 { \
155 efifb_set_system, \
156 name, \
157 { \
158 DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
159 DMI_MATCH(DMI_PRODUCT_NAME, name) \
160 }, \
161 &efifb_dmi_list[enumid] \
162 }
163
164static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
165 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
166 /* At least one of these two will be right; maybe both? */
167 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
168 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
169 /* At least one of these two will be right; maybe both? */
170 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
171 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
172 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
173 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
174 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
175 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
176 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
177 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
178 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
179 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
180 /* At least one of these two will be right; maybe both? */
181 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
182 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
183 /* At least one of these two will be right; maybe both? */
184 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
185 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
186 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
187 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
188 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
189 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
190 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
191 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
192 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
193 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
194 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
195 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
196 EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
197 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
198 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
199 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
200 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
201 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
202 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
203 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
204 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
205 EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
206 {},
207};
208
209__init void sysfb_apply_efi_quirks(void)
210{
211 if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
212 !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
213 dmi_check_system(efifb_dmi_system_table);
214}
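
Note on the choose_value() macro above: the DMI table value wins when its OVERRIDE_* flag is set, and otherwise only when the firmware-reported value is zero. A small standalone demo of that precedence (the numbers are arbitrary examples):

#include <stdio.h>

#define OVERRIDE_STRIDE 0x2

#define choose_value(dmivalue, fwvalue, field, flags) ({ \
	typeof(fwvalue) _ret_ = fwvalue; \
	if ((flags) & (field)) \
		_ret_ = dmivalue; \
	else if ((fwvalue) == 0) \
		_ret_ = dmivalue; \
	_ret_; \
})

int main(void)
{
	unsigned int dmi_stride = 2048 * 4;

	/* firmware reported a stride, no override flag: keep the firmware value */
	printf("%u\n", choose_value(dmi_stride, 5760u, OVERRIDE_STRIDE, 0));
	/* OVERRIDE_STRIDE set (e.g. MacBookAir3,1): the DMI table wins */
	printf("%u\n", choose_value(dmi_stride, 5760u, OVERRIDE_STRIDE,
				    OVERRIDE_STRIDE));
	/* firmware reported 0: fall back to the DMI value */
	printf("%u\n", choose_value(dmi_stride, 0u, OVERRIDE_STRIDE, 0));
	return 0;
}
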
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
new file mode 100644
index 000000000000..22513e96b012
--- /dev/null
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -0,0 +1,95 @@
1/*
2 * Generic System Framebuffers on x86
3 * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 */
10
11/*
12 * simple-framebuffer probing
13 * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
14 * If the mode is incompatible, we return "false" and let the caller create
15 * legacy nodes instead.
16 */
17
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/platform_data/simplefb.h>
23#include <linux/platform_device.h>
24#include <linux/screen_info.h>
25#include <asm/sysfb.h>
26
27static const char simplefb_resname[] = "BOOTFB";
28static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
29
30/* try parsing x86 screen_info into a simple-framebuffer mode struct */
31__init bool parse_mode(const struct screen_info *si,
32 struct simplefb_platform_data *mode)
33{
34 const struct simplefb_format *f;
35 __u8 type;
36 unsigned int i;
37
38 type = si->orig_video_isVGA;
39 if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
40 return false;
41
42 for (i = 0; i < ARRAY_SIZE(formats); ++i) {
43 f = &formats[i];
44 if (si->lfb_depth == f->bits_per_pixel &&
45 si->red_size == f->red.length &&
46 si->red_pos == f->red.offset &&
47 si->green_size == f->green.length &&
48 si->green_pos == f->green.offset &&
49 si->blue_size == f->blue.length &&
50 si->blue_pos == f->blue.offset &&
51 si->rsvd_size == f->transp.length &&
52 si->rsvd_pos == f->transp.offset) {
53 mode->format = f->name;
54 mode->width = si->lfb_width;
55 mode->height = si->lfb_height;
56 mode->stride = si->lfb_linelength;
57 return true;
58 }
59 }
60
61 return false;
62}
63
64__init int create_simplefb(const struct screen_info *si,
65 const struct simplefb_platform_data *mode)
66{
67 struct platform_device *pd;
68 struct resource res;
69 unsigned long len;
70
71 /* don't use lfb_size as it may contain the whole VMEM instead of only
72 * the part that is occupied by the framebuffer */
73 len = mode->height * mode->stride;
74 len = PAGE_ALIGN(len);
75 if (len > si->lfb_size << 16) {
76 printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
77 return -EINVAL;
78 }
79
80 /* setup IORESOURCE_MEM as framebuffer memory */
81 memset(&res, 0, sizeof(res));
82 res.flags = IORESOURCE_MEM;
83 res.name = simplefb_resname;
84 res.start = si->lfb_base;
85 res.end = si->lfb_base + len - 1;
86 if (res.end <= res.start)
87 return -EINVAL;
88
89 pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
90 &res, 1, mode, sizeof(*mode));
91 if (IS_ERR(pd))
92 return PTR_ERR(pd);
93
94 return 0;
95}
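
Note on create_simplefb() above: the resource is sized from height * stride rather than lfb_size, and lfb_size (which screen_info reports in 64 KiB units, hence the << 16) is used only as an upper-bound sanity check. A quick standalone illustration with made-up numbers:

#include <stdio.h>

#define PAGE_ALIGN(x) (((x) + 4095UL) & ~4095UL)

int main(void)
{
	unsigned long stride = 1920 * 4, height = 1080;
	unsigned long lfb_size = 127;	/* reported in 64 KiB units */
	unsigned long len = PAGE_ALIGN(height * stride);

	printf("fb needs %lu bytes, VRAM advertises %lu bytes -> %s\n",
	       len, lfb_size << 16,
	       len > (lfb_size << 16) ? "reject" : "ok");
	return 0;
}
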
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index addf7b58f4e8..91a4496db434 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -301,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
301 return 0; 301 return 0;
302} 302}
303 303
304static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
305{
306 if (!tboot_enabled())
307 return 0;
308
309 pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
310 return -ENODEV;
311}
312
304static atomic_t ap_wfs_count; 313static atomic_t ap_wfs_count;
305 314
306static int tboot_wait_for_aps(int num_aps) 315static int tboot_wait_for_aps(int num_aps)
@@ -422,6 +431,7 @@ static __init int tboot_late_init(void)
422#endif 431#endif
423 432
424 acpi_os_set_prepare_sleep(&tboot_sleep); 433 acpi_os_set_prepare_sleep(&tboot_sleep);
434 acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
425 return 0; 435 return 0;
426} 436}
427 437
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1b23a1c92746..8c8093b146ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -58,6 +58,7 @@
58#include <asm/mce.h> 58#include <asm/mce.h>
59#include <asm/fixmap.h> 59#include <asm/fixmap.h>
60#include <asm/mach_traps.h> 60#include <asm/mach_traps.h>
61#include <asm/alternative.h>
61 62
62#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
63#include <asm/x86_init.h> 64#include <asm/x86_init.h>
@@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
327 ftrace_int3_handler(regs)) 328 ftrace_int3_handler(regs))
328 return; 329 return;
329#endif 330#endif
331 if (poke_int3_handler(regs))
332 return;
333
330 prev_state = exception_enter(); 334 prev_state = exception_enter();
331#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 335#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
332 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 336 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6ff49247edf8..930e5d48f560 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -89,6 +89,12 @@ int check_tsc_unstable(void)
89} 89}
90EXPORT_SYMBOL_GPL(check_tsc_unstable); 90EXPORT_SYMBOL_GPL(check_tsc_unstable);
91 91
92int check_tsc_disabled(void)
93{
94 return tsc_disabled;
95}
96EXPORT_SYMBOL_GPL(check_tsc_disabled);
97
92#ifdef CONFIG_X86_TSC 98#ifdef CONFIG_X86_TSC
93int __init notsc_setup(char *str) 99int __init notsc_setup(char *str)
94{ 100{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 (1 << KVM_FEATURE_CLOCKSOURCE2) | 413 (1 << KVM_FEATURE_CLOCKSOURCE2) |
414 (1 << KVM_FEATURE_ASYNC_PF) | 414 (1 << KVM_FEATURE_ASYNC_PF) |
415 (1 << KVM_FEATURE_PV_EOI) | 415 (1 << KVM_FEATURE_PV_EOI) |
416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
417 (1 << KVM_FEATURE_PV_UNHALT);
417 418
418 if (sched_info_on()) 419 if (sched_info_on())
419 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 420 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
79 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
80} 80}
81 81
82static inline int apic_test_and_set_vector(int vec, void *bitmap)
83{
84 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline int apic_test_and_clear_vector(int vec, void *bitmap)
88{
89 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_test_vector(int vec, void *bitmap) 82static inline int apic_test_vector(int vec, void *bitmap)
93{ 83{
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 84 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
331} 321}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 322EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333 323
334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 324static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
335{ 325{
336 apic->irr_pending = true; 326 apic->irr_pending = true;
337 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 327 apic_set_vector(vec, apic->regs + APIC_IRR);
338} 328}
339 329
340static inline int apic_search_irr(struct kvm_lapic *apic) 330static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
681 if (unlikely(!apic_enabled(apic))) 671 if (unlikely(!apic_enabled(apic)))
682 break; 672 break;
683 673
674 result = 1;
675
684 if (dest_map) 676 if (dest_map)
685 __set_bit(vcpu->vcpu_id, dest_map); 677 __set_bit(vcpu->vcpu_id, dest_map);
686 678
687 if (kvm_x86_ops->deliver_posted_interrupt) { 679 if (kvm_x86_ops->deliver_posted_interrupt)
688 result = 1;
689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 680 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
690 } else { 681 else {
691 result = !apic_test_and_set_irr(vector, apic); 682 apic_set_irr(vector, apic);
692
693 if (!result) {
694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699 683
700 kvm_make_request(KVM_REQ_EVENT, vcpu); 684 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu); 685 kvm_vcpu_kick(vcpu);
702 } 686 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 687 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result); 688 trig_mode, vector, false);
706 break; 689 break;
707 690
708 case APIC_DM_REMRD: 691 case APIC_DM_REMRD:
709 apic_debug("Ignoring delivery mode 3\n"); 692 result = 1;
693 vcpu->arch.pv.pv_unhalted = 1;
694 kvm_make_request(KVM_REQ_EVENT, vcpu);
695 kvm_vcpu_kick(vcpu);
710 break; 696 break;
711 697
712 case APIC_DM_SMI: 698 case APIC_DM_SMI:
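
Note on the new APIC_DM_REMRD handling above: it marks the target vCPU pv_unhalted and kicks it, pairing with the KVM_FEATURE_PV_UNHALT bit exposed in cpuid.c. A hedged guest-side sketch of how an unlocker would trigger that path, assuming the KVM_HC_KICK_CPU hypercall from the same pv-ticketlock series (the function name and zero flags argument are illustrative):

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/kvm_para.h>
#include <asm/smp.h>

/* ask the host to unhalt the vCPU waiting for the lock (sketch only) */
static void example_kvm_kick_cpu(int cpu)
{
	int apicid = per_cpu(x86_cpu_to_apicid, cpu);

	kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
}
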
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..6e2d2c8f230b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133 * PT32_LEVEL_BITS))) - 1)) 133 * PT32_LEVEL_BITS))) - 1))
134 134
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
136 | PT64_NX_MASK) 136 | shadow_x_mask | shadow_nx_mask)
137 137
138#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
139#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
331 return pte & PT_PAGE_SIZE_MASK; 331 return pte & PT_PAGE_SIZE_MASK;
332} 332}
333 333
334static int is_dirty_gpte(unsigned long pte)
335{
336 return pte & PT_DIRTY_MASK;
337}
338
339static int is_rmap_spte(u64 pte) 334static int is_rmap_spte(u64 pte)
340{ 335{
341 return is_shadow_present_pte(pte); 336 return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2052 return __shadow_walk_next(iterator, *iterator->sptep); 2047 return __shadow_walk_next(iterator, *iterator->sptep);
2053} 2048}
2054 2049
2055static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 2050static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
2056{ 2051{
2057 u64 spte; 2052 u64 spte;
2058 2053
2054 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
2055 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2056
2059 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2057 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2060 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2058 shadow_user_mask | shadow_x_mask;
2059
2060 if (accessed)
2061 spte |= shadow_accessed_mask;
2061 2062
2062 mmu_spte_set(sptep, spte); 2063 mmu_spte_set(sptep, spte);
2063} 2064}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574 mmu_free_roots(vcpu); 2575 mmu_free_roots(vcpu);
2575} 2576}
2576 2577
2577static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2578{
2579 int bit7;
2580
2581 bit7 = (gpte >> 7) & 1;
2582 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2583}
2584
2585static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2586 bool no_dirty_log) 2579 bool no_dirty_log)
2587{ 2580{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2594 return gfn_to_pfn_memslot_atomic(slot, gfn); 2587 return gfn_to_pfn_memslot_atomic(slot, gfn);
2595} 2588}
2596 2589
2597static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2598 struct kvm_mmu_page *sp, u64 *spte,
2599 u64 gpte)
2600{
2601 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2602 goto no_present;
2603
2604 if (!is_present_gpte(gpte))
2605 goto no_present;
2606
2607 if (!(gpte & PT_ACCESSED_MASK))
2608 goto no_present;
2609
2610 return false;
2611
2612no_present:
2613 drop_spte(vcpu->kvm, spte);
2614 return true;
2615}
2616
2617static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2590static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2618 struct kvm_mmu_page *sp, 2591 struct kvm_mmu_page *sp,
2619 u64 *start, u64 *end) 2592 u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2710 iterator.level - 1, 2683 iterator.level - 1,
2711 1, ACC_ALL, iterator.sptep); 2684 1, ACC_ALL, iterator.sptep);
2712 2685
2713 link_shadow_page(iterator.sptep, sp); 2686 link_shadow_page(iterator.sptep, sp, true);
2714 } 2687 }
2715 } 2688 }
2716 return emulate; 2689 return emulate;
@@ -2808,7 +2781,7 @@ exit:
2808 return ret; 2781 return ret;
2809} 2782}
2810 2783
2811static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2784static bool page_fault_can_be_fast(u32 error_code)
2812{ 2785{
2813 /* 2786 /*
2814 * Do not fix the mmio spte with invalid generation number which 2787 * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2861 bool ret = false; 2834 bool ret = false;
2862 u64 spte = 0ull; 2835 u64 spte = 0ull;
2863 2836
2864 if (!page_fault_can_be_fast(vcpu, error_code)) 2837 if (!page_fault_can_be_fast(error_code))
2865 return false; 2838 return false;
2866 2839
2867 walk_shadow_page_lockless_begin(vcpu); 2840 walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3209 mmu_sync_roots(vcpu); 3182 mmu_sync_roots(vcpu);
3210 spin_unlock(&vcpu->kvm->mmu_lock); 3183 spin_unlock(&vcpu->kvm->mmu_lock);
3211} 3184}
3185EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3212 3186
3213static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3187static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3214 u32 access, struct x86_exception *exception) 3188 u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3478 ++vcpu->stat.tlb_flush; 3452 ++vcpu->stat.tlb_flush;
3479 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3453 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3480} 3454}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3481 3456
3482static void paging_new_cr3(struct kvm_vcpu *vcpu) 3457static void paging_new_cr3(struct kvm_vcpu *vcpu)
3483{ 3458{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3501 nonpaging_free(vcpu); 3476 nonpaging_free(vcpu);
3502} 3477}
3503 3478
3504static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3505{
3506 unsigned mask;
3507
3508 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3509
3510 mask = (unsigned)~ACC_WRITE_MASK;
3511 /* Allow write access to dirty gptes */
3512 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3513 *access &= mask;
3514}
3515
3516static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3517 unsigned access, int *nr_present) 3480 unsigned access, int *nr_present)
3518{ 3481{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3530 return false; 3493 return false;
3531} 3494}
3532 3495
3533static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3534{
3535 unsigned access;
3536
3537 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3538 access &= ~(gpte >> PT64_NX_SHIFT);
3539
3540 return access;
3541}
3542
3543static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3496static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3544{ 3497{
3545 unsigned index; 3498 unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
3549 return mmu->last_pte_bitmap & (1 << index); 3502 return mmu->last_pte_bitmap & (1 << index);
3550} 3503}
3551 3504
3505#define PTTYPE_EPT 18 /* arbitrary */
3506#define PTTYPE PTTYPE_EPT
3507#include "paging_tmpl.h"
3508#undef PTTYPE
3509
3552#define PTTYPE 64 3510#define PTTYPE 64
3553#include "paging_tmpl.h" 3511#include "paging_tmpl.h"
3554#undef PTTYPE 3512#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3563 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3521 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3564 u64 exb_bit_rsvd = 0; 3522 u64 exb_bit_rsvd = 0;
3565 3523
3524 context->bad_mt_xwr = 0;
3525
3566 if (!context->nx) 3526 if (!context->nx)
3567 exb_bit_rsvd = rsvd_bits(63, 63); 3527 exb_bit_rsvd = rsvd_bits(63, 63);
3568 switch (context->root_level) { 3528 switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3618 } 3578 }
3619} 3579}
3620 3580
3621static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3581static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
3582 struct kvm_mmu *context, bool execonly)
3583{
3584 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3585 int pte;
3586
3587 context->rsvd_bits_mask[0][3] =
3588 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
3589 context->rsvd_bits_mask[0][2] =
3590 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3591 context->rsvd_bits_mask[0][1] =
3592 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3593 context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
3594
3595 /* large page */
3596 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3597 context->rsvd_bits_mask[1][2] =
3598 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
3599 context->rsvd_bits_mask[1][1] =
3600 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3601 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3602
3603 for (pte = 0; pte < 64; pte++) {
3604 int rwx_bits = pte & 7;
3605 int mt = pte >> 3;
3606 if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
3607 rwx_bits == 0x2 || rwx_bits == 0x6 ||
3608 (rwx_bits == 0x4 && !execonly))
3609 context->bad_mt_xwr |= (1ull << pte);
3610 }
3611}
3612
3613static void update_permission_bitmask(struct kvm_vcpu *vcpu,
3614 struct kvm_mmu *mmu, bool ept)
3622{ 3615{
3623 unsigned bit, byte, pfec; 3616 unsigned bit, byte, pfec;
3624 u8 map; 3617 u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
3636 w = bit & ACC_WRITE_MASK; 3629 w = bit & ACC_WRITE_MASK;
3637 u = bit & ACC_USER_MASK; 3630 u = bit & ACC_USER_MASK;
3638 3631
3639 /* Not really needed: !nx will cause pte.nx to fault */ 3632 if (!ept) {
3640 x |= !mmu->nx; 3633 /* Not really needed: !nx will cause pte.nx to fault */
3641 /* Allow supervisor writes if !cr0.wp */ 3634 x |= !mmu->nx;
3642 w |= !is_write_protection(vcpu) && !uf; 3635 /* Allow supervisor writes if !cr0.wp */
3643 /* Disallow supervisor fetches of user code if cr4.smep */ 3636 w |= !is_write_protection(vcpu) && !uf;
3644 x &= !(smep && u && !uf); 3637 /* Disallow supervisor fetches of user code if cr4.smep */
3638 x &= !(smep && u && !uf);
3639 } else
3640 /* Not really needed: no U/S accesses on ept */
3641 u = 1;
3645 3642
3646 fault = (ff && !x) || (uf && !u) || (wf && !w); 3643 fault = (ff && !x) || (uf && !u) || (wf && !w);
3647 map |= fault << bit; 3644 map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3676 context->root_level = level; 3673 context->root_level = level;
3677 3674
3678 reset_rsvds_bits_mask(vcpu, context); 3675 reset_rsvds_bits_mask(vcpu, context);
3679 update_permission_bitmask(vcpu, context); 3676 update_permission_bitmask(vcpu, context, false);
3680 update_last_pte_bitmap(vcpu, context); 3677 update_last_pte_bitmap(vcpu, context);
3681 3678
3682 ASSERT(is_pae(vcpu)); 3679 ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 context->root_level = PT32_ROOT_LEVEL; 3703 context->root_level = PT32_ROOT_LEVEL;
3707 3704
3708 reset_rsvds_bits_mask(vcpu, context); 3705 reset_rsvds_bits_mask(vcpu, context);
3709 update_permission_bitmask(vcpu, context); 3706 update_permission_bitmask(vcpu, context, false);
3710 update_last_pte_bitmap(vcpu, context); 3707 update_last_pte_bitmap(vcpu, context);
3711 3708
3712 context->new_cr3 = paging_new_cr3; 3709 context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3768 context->gva_to_gpa = paging32_gva_to_gpa; 3765 context->gva_to_gpa = paging32_gva_to_gpa;
3769 } 3766 }
3770 3767
3771 update_permission_bitmask(vcpu, context); 3768 update_permission_bitmask(vcpu, context, false);
3772 update_last_pte_bitmap(vcpu, context); 3769 update_last_pte_bitmap(vcpu, context);
3773 3770
3774 return 0; 3771 return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3800} 3797}
3801EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3802 3799
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly)
3802{
3803 ASSERT(vcpu);
3804 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3805
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807
3808 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false;
3819
3820 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826
3803static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3804{ 3828{
3805 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3847 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3871 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3848 } 3872 }
3849 3873
3850 update_permission_bitmask(vcpu, g_context); 3874 update_permission_bitmask(vcpu, g_context, false);
3851 update_last_pte_bitmap(vcpu, g_context); 3875 update_last_pte_bitmap(vcpu, g_context);
3852 3876
3853 return 0; 3877 return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
3923 return true; 3947 return true;
3924 if ((old ^ new) & PT64_BASE_ADDR_MASK) 3948 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3925 return true; 3949 return true;
3926 old ^= PT64_NX_MASK; 3950 old ^= shadow_nx_mask;
3927 new ^= PT64_NX_MASK; 3951 new ^= shadow_nx_mask;
3928 return (old & ~new & PT64_PERM_MASK) != 0; 3952 return (old & ~new & PT64_PERM_MASK) != 0;
3929} 3953}
3930 3954
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4182 switch (er) { 4206 switch (er) {
4183 case EMULATE_DONE: 4207 case EMULATE_DONE:
4184 return 1; 4208 return 1;
4185 case EMULATE_DO_MMIO: 4209 case EMULATE_USER_EXIT:
4186 ++vcpu->stat.mmio_exits; 4210 ++vcpu->stat.mmio_exits;
4187 /* fall through */ 4211 /* fall through */
4188 case EMULATE_FAIL: 4212 case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4390 /* 4414 /*
4391 * The very rare case: if the generation-number is round, 4415 * The very rare case: if the generation-number is round,
4392 * zap all shadow pages. 4416 * zap all shadow pages.
4393 *
4394 * The max value is MMIO_MAX_GEN - 1 since it is not called
4395 * when mark memslot invalid.
4396 */ 4417 */
4397 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { 4418 if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
4398 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); 4419 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4399 kvm_mmu_invalidate_zap_all_pages(kvm); 4420 kvm_mmu_invalidate_zap_all_pages(kvm);
4400 } 4421 }
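
Note on reset_rsvds_bits_mask_ept() above: bad_mt_xwr is a 64-bit map indexed by the low 6 bits of an EPT pte (XWR permission bits plus memory type), with a bit set for each illegal combination: reserved memory types 2, 3 and 7, write-without-read (0x2), write+execute-without-read (0x6), and execute-only (0x4) unless exec-only EPT is allowed. A standalone recomputation of that map:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t build_bad_mt_xwr(bool execonly)
{
	uint64_t bad_mt_xwr = 0;
	int pte;

	for (pte = 0; pte < 64; pte++) {
		int rwx_bits = pte & 7;	/* bit0=R, bit1=W, bit2=X */
		int mt = pte >> 3;	/* EPT memory type */

		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
		    rwx_bits == 0x2 || rwx_bits == 0x6 ||
		    (rwx_bits == 0x4 && !execonly))
			bad_mt_xwr |= 1ull << pte;
	}
	return bad_mt_xwr;
}

int main(void)
{
	uint64_t map = build_bad_mt_xwr(false);

	/* WB (mt=6) read+write+execute is legal; write-only is not */
	printf("mt=WB rwx=111 bad? %d\n", !!(map & (1ull << ((6 << 3) | 7))));
	printf("mt=WB rwx=010 bad? %d\n", !!(map & (1ull << ((6 << 3) | 2))));
	/* execute-only is rejected unless exec-only EPT is advertised */
	printf("exec-only bad (no execonly)? %d\n", !!(map & (1ull << ((6 << 3) | 4))));
	printf("exec-only bad (execonly)?    %d\n",
	       !!(build_bad_mt_xwr(true) & (1ull << ((6 << 3) | 4))));
	return 0;
}
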
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly);
74 76
75static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
76{ 78{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
23 * so the code in this file is compiled twice, once per pte size. 23 * so the code in this file is compiled twice, once per pte size.
24 */ 24 */
25 25
26/*
27 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
28 * uses for EPT without A/D paging type.
29 */
30extern u64 __pure __using_nonexistent_pte_bit(void)
31 __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
32
26#if PTTYPE == 64 33#if PTTYPE == 64
27 #define pt_element_t u64 34 #define pt_element_t u64
28 #define guest_walker guest_walker64 35 #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 39 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 40 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 41 #define PT_LEVEL_BITS PT64_LEVEL_BITS
42 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
43 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
44 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
45 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
35 #ifdef CONFIG_X86_64 46 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 47 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg 48 #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 60 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
50 #define PT_LEVEL_BITS PT32_LEVEL_BITS 61 #define PT_LEVEL_BITS PT32_LEVEL_BITS
51 #define PT_MAX_FULL_LEVELS 2 62 #define PT_MAX_FULL_LEVELS 2
63 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
64 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
65 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
66 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
52 #define CMPXCHG cmpxchg 67 #define CMPXCHG cmpxchg
68#elif PTTYPE == PTTYPE_EPT
69 #define pt_element_t u64
70 #define guest_walker guest_walkerEPT
71 #define FNAME(name) ept_##name
72 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
73 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
74 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
75 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
76 #define PT_LEVEL_BITS PT64_LEVEL_BITS
77 #define PT_GUEST_ACCESSED_MASK 0
78 #define PT_GUEST_DIRTY_MASK 0
79 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
80 #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
81 #define CMPXCHG cmpxchg64
82 #define PT_MAX_FULL_LEVELS 4
53#else 83#else
54 #error Invalid PTTYPE value 84 #error Invalid PTTYPE value
55#endif 85#endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
80 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 110 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
81} 111}
82 112
113static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
114{
115 unsigned mask;
116
117 /* dirty bit is not supported, so no need to track it */
118 if (!PT_GUEST_DIRTY_MASK)
119 return;
120
121 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
122
123 mask = (unsigned)~ACC_WRITE_MASK;
124 /* Allow write access to dirty gptes */
125 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
126 PT_WRITABLE_MASK;
127 *access &= mask;
128}
129
130static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
131{
132 int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
133
134 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
135 ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
136}
137
138static inline int FNAME(is_present_gpte)(unsigned long pte)
139{
140#if PTTYPE != PTTYPE_EPT
141 return is_present_gpte(pte);
142#else
143 return pte & 7;
144#endif
145}
146
83static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
84 pt_element_t __user *ptep_user, unsigned index, 148 pt_element_t __user *ptep_user, unsigned index,
85 pt_element_t orig_pte, pt_element_t new_pte) 149 pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
103 return (ret != orig_pte); 167 return (ret != orig_pte);
104} 168}
105 169
170static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
171 struct kvm_mmu_page *sp, u64 *spte,
172 u64 gpte)
173{
174 if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
175 goto no_present;
176
177 if (!FNAME(is_present_gpte)(gpte))
178 goto no_present;
179
 180 /* if the accessed bit is not supported, prefetch the non-accessed gpte */
181 if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
182 goto no_present;
183
184 return false;
185
186no_present:
187 drop_spte(vcpu->kvm, spte);
188 return true;
189}
190
191static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
192{
193 unsigned access;
194#if PTTYPE == PTTYPE_EPT
195 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
196 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
197 ACC_USER_MASK;
198#else
199 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
200 access &= ~(gpte >> PT64_NX_SHIFT);
201#endif
202
203 return access;
204}
205
106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 206static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
107 struct kvm_mmu *mmu, 207 struct kvm_mmu *mmu,
108 struct guest_walker *walker, 208 struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
114 gfn_t table_gfn; 214 gfn_t table_gfn;
115 int ret; 215 int ret;
116 216
217 /* dirty/accessed bits are not supported, so no need to update them */
218 if (!PT_GUEST_DIRTY_MASK)
219 return 0;
220
117 for (level = walker->max_level; level >= walker->level; --level) { 221 for (level = walker->max_level; level >= walker->level; --level) {
118 pte = orig_pte = walker->ptes[level - 1]; 222 pte = orig_pte = walker->ptes[level - 1];
119 table_gfn = walker->table_gfn[level - 1]; 223 table_gfn = walker->table_gfn[level - 1];
120 ptep_user = walker->ptep_user[level - 1]; 224 ptep_user = walker->ptep_user[level - 1];
121 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 225 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
122 if (!(pte & PT_ACCESSED_MASK)) { 226 if (!(pte & PT_GUEST_ACCESSED_MASK)) {
123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 227 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
124 pte |= PT_ACCESSED_MASK; 228 pte |= PT_GUEST_ACCESSED_MASK;
125 } 229 }
126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { 230 if (level == walker->level && write_fault &&
231 !(pte & PT_GUEST_DIRTY_MASK)) {
127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
128 pte |= PT_DIRTY_MASK; 233 pte |= PT_GUEST_DIRTY_MASK;
129 } 234 }
130 if (pte == orig_pte) 235 if (pte == orig_pte)
131 continue; 236 continue;
@@ -170,7 +275,7 @@ retry_walk:
170 if (walker->level == PT32E_ROOT_LEVEL) { 275 if (walker->level == PT32E_ROOT_LEVEL) {
171 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 276 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
172 trace_kvm_mmu_paging_element(pte, walker->level); 277 trace_kvm_mmu_paging_element(pte, walker->level);
173 if (!is_present_gpte(pte)) 278 if (!FNAME(is_present_gpte)(pte))
174 goto error; 279 goto error;
175 --walker->level; 280 --walker->level;
176 } 281 }
@@ -179,7 +284,7 @@ retry_walk:
179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 284 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 285 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
181 286
182 accessed_dirty = PT_ACCESSED_MASK; 287 accessed_dirty = PT_GUEST_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL; 288 pt_access = pte_access = ACC_ALL;
184 ++walker->level; 289 ++walker->level;
185 290
@@ -215,17 +320,17 @@ retry_walk:
215 320
216 trace_kvm_mmu_paging_element(pte, walker->level); 321 trace_kvm_mmu_paging_element(pte, walker->level);
217 322
218 if (unlikely(!is_present_gpte(pte))) 323 if (unlikely(!FNAME(is_present_gpte)(pte)))
219 goto error; 324 goto error;
220 325
221 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 326 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
222 walker->level))) { 327 walker->level))) {
223 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 328 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
224 goto error; 329 goto error;
225 } 330 }
226 331
227 accessed_dirty &= pte; 332 accessed_dirty &= pte;
228 pte_access = pt_access & gpte_access(vcpu, pte); 333 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
229 334
230 walker->ptes[walker->level - 1] = pte; 335 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte)); 336 } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
248 walker->gfn = real_gpa >> PAGE_SHIFT; 353 walker->gfn = real_gpa >> PAGE_SHIFT;
249 354
250 if (!write_fault) 355 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 356 FNAME(protect_clean_gpte)(&pte_access, pte);
252 else 357 else
253 /* 358 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by 359 * On a write fault, fold the dirty bit into accessed_dirty.
255 * shifting it one place right. 360 * For modes without A/D bits support accessed_dirty will be
361 * always clear.
256 */ 362 */
257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); 363 accessed_dirty &= pte >>
364 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
258 365
259 if (unlikely(!accessed_dirty)) { 366 if (unlikely(!accessed_dirty)) {
260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 367 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
279 walker->fault.vector = PF_VECTOR; 386 walker->fault.vector = PF_VECTOR;
280 walker->fault.error_code_valid = true; 387 walker->fault.error_code_valid = true;
281 walker->fault.error_code = errcode; 388 walker->fault.error_code = errcode;
389
390#if PTTYPE == PTTYPE_EPT
391 /*
 392 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
 393 * misconfiguration needs to be injected. The detection is
394 * done by is_rsvd_bits_set() above.
395 *
396 * We set up the value of exit_qualification to inject:
397 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
398 * [5:3] - Calculated by the page walk of the guest EPT page tables
399 * [7:8] - Derived from [7:8] of real exit_qualification
400 *
401 * The other bits are set to 0.
402 */
403 if (!(errcode & PFERR_RSVD_MASK)) {
404 vcpu->arch.exit_qualification &= 0x187;
405 vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
406 }
407#endif
282 walker->fault.address = addr; 408 walker->fault.address = addr;
283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 409 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
284 410
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
293 access); 419 access);
294} 420}
295 421
422#if PTTYPE != PTTYPE_EPT
296static int FNAME(walk_addr_nested)(struct guest_walker *walker, 423static int FNAME(walk_addr_nested)(struct guest_walker *walker,
297 struct kvm_vcpu *vcpu, gva_t addr, 424 struct kvm_vcpu *vcpu, gva_t addr,
298 u32 access) 425 u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
300 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, 427 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
301 addr, access); 428 addr, access);
302} 429}
430#endif
303 431
304static bool 432static bool
305FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 433FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
309 gfn_t gfn; 437 gfn_t gfn;
310 pfn_t pfn; 438 pfn_t pfn;
311 439
312 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
313 return false; 441 return false;
314 442
315 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 443 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
316 444
317 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
318 pte_access = sp->role.access & gpte_access(vcpu, gpte); 446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
319 protect_clean_gpte(&pte_access, gpte); 447 FNAME(protect_clean_gpte)(&pte_access, gpte);
320 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 448 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
321 no_dirty_log && (pte_access & ACC_WRITE_MASK)); 449 no_dirty_log && (pte_access & ACC_WRITE_MASK));
322 if (is_error_pfn(pfn)) 450 if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
446 goto out_gpte_changed; 574 goto out_gpte_changed;
447 575
448 if (sp) 576 if (sp)
449 link_shadow_page(it.sptep, sp); 577 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
450 } 578 }
451 579
452 for (; 580 for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
466 594
467 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 595 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
468 true, direct_access, it.sptep); 596 true, direct_access, it.sptep);
469 link_shadow_page(it.sptep, sp); 597 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
470 } 598 }
471 599
472 clear_sp_write_flooding_count(it.sptep); 600 clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
727 return gpa; 855 return gpa;
728} 856}
729 857
858#if PTTYPE != PTTYPE_EPT
730static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 859static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
731 u32 access, 860 u32 access,
732 struct x86_exception *exception) 861 struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
745 874
746 return gpa; 875 return gpa;
747} 876}
877#endif
748 878
749/* 879/*
750 * Using the cached information from sp->gfns is safe because: 880 * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
785 sizeof(pt_element_t))) 915 sizeof(pt_element_t)))
786 return -EINVAL; 916 return -EINVAL;
787 917
788 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 918 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
789 vcpu->kvm->tlbs_dirty++; 919 vcpu->kvm->tlbs_dirty++;
790 continue; 920 continue;
791 } 921 }
792 922
793 gfn = gpte_to_gfn(gpte); 923 gfn = gpte_to_gfn(gpte);
794 pte_access = sp->role.access; 924 pte_access = sp->role.access;
795 pte_access &= gpte_access(vcpu, gpte); 925 pte_access &= FNAME(gpte_access)(vcpu, gpte);
796 protect_clean_gpte(&pte_access, gpte); 926 FNAME(protect_clean_gpte)(&pte_access, gpte);
797 927
798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 928 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present)) 929 &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
830#undef gpte_to_gfn 960#undef gpte_to_gfn
831#undef gpte_to_gfn_lvl 961#undef gpte_to_gfn_lvl
832#undef CMPXCHG 962#undef CMPXCHG
963#undef PT_GUEST_ACCESSED_MASK
964#undef PT_GUEST_DIRTY_MASK
965#undef PT_GUEST_DIRTY_SHIFT
966#undef PT_GUEST_ACCESSED_SHIFT
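The PT_GUEST_* parameterization above works because, for the classic formats, the writable, accessed and dirty bits sit at fixed positions, so protect_clean_gpte() and walk_addr_generic() can move the dirty bit around with plain shifts, while the EPT variant defines the masks to 0 and compiles the paths away. A minimal user-space sketch of the two shift tricks follows; the x86 bit positions (W=1, A=5, D=6) are hard-coded here as assumptions rather than taken from kernel headers, and the helpers are illustrative copies, not the kernel functions themselves.

#include <stdio.h>
#include <stdint.h>

/* x86 PTE bit positions, hard-coded here as assumptions. */
#define PT_WRITABLE_SHIFT 1
#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6
#define PT_WRITABLE_MASK  (1u << PT_WRITABLE_SHIFT)
#define PT_ACCESSED_MASK  (1u << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK     (1u << PT_DIRTY_SHIFT)
#define ACC_WRITE_MASK    PT_WRITABLE_MASK  /* same bit, as the BUILD_BUG_ON asserts */

/* Mirrors protect_clean_gpte(): drop write permission unless the gpte is dirty. */
static unsigned int protect_clean(unsigned int access, uint64_t gpte)
{
	unsigned int mask = ~ACC_WRITE_MASK;

	/* Shift the dirty bit down into the writable bit's position. */
	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
	return access & mask;
}

int main(void)
{
	uint64_t clean = PT_ACCESSED_MASK;                 /* A=1, D=0 */
	uint64_t dirty = PT_ACCESSED_MASK | PT_DIRTY_MASK; /* A=1, D=1 */

	printf("clean gpte keeps write? %d\n",
	       !!(protect_clean(ACC_WRITE_MASK, clean) & ACC_WRITE_MASK));
	printf("dirty gpte keeps write? %d\n",
	       !!(protect_clean(ACC_WRITE_MASK, dirty) & ACC_WRITE_MASK));

	/* The write-fault fold in walk_addr_generic(): move D onto A. */
	uint64_t accessed_dirty = PT_ACCESSED_MASK;
	accessed_dirty &= dirty >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
	printf("after write-fault fold: accessed_dirty = %#llx\n",
	       (unsigned long long)accessed_dirty);
	return 0;
}

Running it shows write access surviving only for the dirty gpte, and the dirty bit landing in the accessed position on the write-fault path, which is why a clean accessed_dirty triggers update_accessed_dirty_bits().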
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
160 160
161static void reprogram_counter(struct kvm_pmc *pmc, u32 type, 161static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
162 unsigned config, bool exclude_user, bool exclude_kernel, 162 unsigned config, bool exclude_user, bool exclude_kernel,
163 bool intr) 163 bool intr, bool in_tx, bool in_tx_cp)
164{ 164{
165 struct perf_event *event; 165 struct perf_event *event;
166 struct perf_event_attr attr = { 166 struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
173 .exclude_kernel = exclude_kernel, 173 .exclude_kernel = exclude_kernel,
174 .config = config, 174 .config = config,
175 }; 175 };
176 if (in_tx)
177 attr.config |= HSW_IN_TX;
178 if (in_tx_cp)
179 attr.config |= HSW_IN_TX_CHECKPOINTED;
176 180
177 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); 181 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178 182
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
226 230
227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 231 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228 ARCH_PERFMON_EVENTSEL_INV | 232 ARCH_PERFMON_EVENTSEL_INV |
229 ARCH_PERFMON_EVENTSEL_CMASK))) { 233 ARCH_PERFMON_EVENTSEL_CMASK |
234 HSW_IN_TX |
235 HSW_IN_TX_CHECKPOINTED))) {
230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 236 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231 unit_mask); 237 unit_mask);
232 if (config != PERF_COUNT_HW_MAX) 238 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
239 reprogram_counter(pmc, type, config, 245 reprogram_counter(pmc, type, config,
240 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 246 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 247 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242 eventsel & ARCH_PERFMON_EVENTSEL_INT); 248 eventsel & ARCH_PERFMON_EVENTSEL_INT,
249 (eventsel & HSW_IN_TX),
250 (eventsel & HSW_IN_TX_CHECKPOINTED));
243} 251}
244 252
245static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 253static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
256 arch_events[fixed_pmc_events[idx]].event_type, 264 arch_events[fixed_pmc_events[idx]].event_type,
257 !(en & 0x2), /* exclude user */ 265 !(en & 0x2), /* exclude user */
258 !(en & 0x1), /* exclude kernel */ 266 !(en & 0x1), /* exclude kernel */
259 pmi); 267 pmi, false, false);
260} 268}
261 269
262static inline u8 fixed_en_pmi(u64 ctrl, int idx) 270static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 416 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
409 if (data == pmc->eventsel) 417 if (data == pmc->eventsel)
410 return 0; 418 return 0;
411 if (!(data & 0xffffffff00200000ull)) { 419 if (!(data & pmu->reserved_bits)) {
412 reprogram_gp_counter(pmc, data); 420 reprogram_gp_counter(pmc, data);
413 return 0; 421 return 0;
414 } 422 }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
450 pmu->counter_bitmask[KVM_PMC_GP] = 0; 458 pmu->counter_bitmask[KVM_PMC_GP] = 0;
451 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 459 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
452 pmu->version = 0; 460 pmu->version = 0;
461 pmu->reserved_bits = 0xffffffff00200000ull;
453 462
454 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 463 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
455 if (!entry) 464 if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
478 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 487 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
479 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 488 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
480 pmu->global_ctrl_mask = ~pmu->global_ctrl; 489 pmu->global_ctrl_mask = ~pmu->global_ctrl;
490
491 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
492 if (entry &&
493 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
494 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
495 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
481} 496}
482 497
483void kvm_pmu_init(struct kvm_vcpu *vcpu) 498void kvm_pmu_init(struct kvm_vcpu *vcpu)
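The pmu.c change is a gate-plus-forward pattern: once the guest's CPUID advertises HLE or RTM, the two Haswell TSX bits are carved out of pmu->reserved_bits, so a guest event-select write carrying them is no longer rejected, and reprogram_counter() copies them into perf's attr.config. A self-contained sketch of that flow; the HSW_IN_TX bit positions (32 and 33) and the sample eventsel value are assumptions made for illustration.

#include <stdio.h>
#include <stdint.h>

#define HSW_IN_TX               (1ULL << 32)  /* assumed bit positions */
#define HSW_IN_TX_CHECKPOINTED  (1ULL << 33)

int main(void)
{
	uint64_t reserved_bits = 0xffffffff00200000ull;  /* default, as set above */
	uint64_t eventsel = 0xc4 | HSW_IN_TX;            /* hypothetical guest eventsel */

	printf("without HLE/RTM in CPUID: rejected? %d\n",
	       (eventsel & reserved_bits) != 0);

	/* kvm_pmu_cpuid_update(): guest advertises HLE/RTM, unreserve the bits. */
	reserved_bits ^= HSW_IN_TX | HSW_IN_TX_CHECKPOINTED;
	printf("with HLE/RTM in CPUID:    rejected? %d\n",
	       (eventsel & reserved_bits) != 0);

	/* reprogram_counter() then forwards the bits into perf's config. */
	uint64_t config = eventsel & 0xffffull;  /* event/umask part (simplified) */
	if (eventsel & HSW_IN_TX)
		config |= HSW_IN_TX;
	if (eventsel & HSW_IN_TX_CHECKPOINTED)
		config |= HSW_IN_TX_CHECKPOINTED;
	printf("perf config = %#llx\n", (unsigned long long)config);
	return 0;
}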
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..1f1da43ff2a2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
373 * we must keep them pinned while L2 runs. 373 * we must keep them pinned while L2 runs.
374 */ 374 */
375 struct page *apic_access_page; 375 struct page *apic_access_page;
376 u64 msr_ia32_feature_control;
376}; 377};
377 378
378#define POSTED_INTR_ON 0 379#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
711 kvm_release_page_clean(page); 712 kvm_release_page_clean(page);
712} 713}
713 714
715static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
714static u64 construct_eptp(unsigned long root_hpa); 716static u64 construct_eptp(unsigned long root_hpa);
715static void kvm_cpu_vmxon(u64 addr); 717static void kvm_cpu_vmxon(u64 addr);
716static void kvm_cpu_vmxoff(void); 718static void kvm_cpu_vmxoff(void);
717static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
718static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 719static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
719static void vmx_set_segment(struct kvm_vcpu *vcpu, 720static void vmx_set_segment(struct kvm_vcpu *vcpu,
720 struct kvm_segment *var, int seg); 721 struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1039 (vmcs12->secondary_vm_exec_control & bit); 1040 (vmcs12->secondary_vm_exec_control & bit);
1040} 1041}
1041 1042
1042static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, 1043static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1043 struct kvm_vcpu *vcpu)
1044{ 1044{
1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1046} 1046}
1047 1047
1048static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1049{
1050 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1051}
1052
1048static inline bool is_exception(u32 intr_info) 1053static inline bool is_exception(u32 intr_info)
1049{ 1054{
1050 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1055 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2155static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2160static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2156static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2161static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2157static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2162static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2163static u32 nested_vmx_ept_caps;
2158static __init void nested_vmx_setup_ctls_msrs(void) 2164static __init void nested_vmx_setup_ctls_msrs(void)
2159{ 2165{
2160 /* 2166 /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2190 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2196 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2191 * 17 must be 1. 2197 * 17 must be 1.
2192 */ 2198 */
2199 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2200 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2193 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2201 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2194 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2202 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2203 nested_vmx_exit_ctls_high &=
2195#ifdef CONFIG_X86_64 2204#ifdef CONFIG_X86_64
2196 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2205 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2197#else
2198 nested_vmx_exit_ctls_high = 0;
2199#endif 2206#endif
2200 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER);
2201 2210
2202 /* entry controls */ 2211 /* entry controls */
2203 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2205 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2214 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2206 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2215 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2207 nested_vmx_entry_ctls_high &= 2216 nested_vmx_entry_ctls_high &=
2208 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2217#ifdef CONFIG_X86_64
2209 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2218 VM_ENTRY_IA32E_MODE |
2219#endif
2220 VM_ENTRY_LOAD_IA32_PAT;
2221 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2222 VM_ENTRY_LOAD_IA32_EFER);
2210 2223
2211 /* cpu-based controls */ 2224 /* cpu-based controls */
2212 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2225 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2242 SECONDARY_EXEC_WBINVD_EXITING; 2255 SECONDARY_EXEC_WBINVD_EXITING;
2243 2256
2257 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /*
 2264 * Since invept is completely emulated, we support both global
 2265 * and context invalidation independent of what the host cpu
2266 * supports
2267 */
2268 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2269 VMX_EPT_EXTENT_CONTEXT_BIT;
2270 } else
2271 nested_vmx_ept_caps = 0;
2272
2244 /* miscellaneous data */ 2273 /* miscellaneous data */
2245 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2274 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2246 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2275 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2282 2311
2283 switch (msr_index) { 2312 switch (msr_index) {
2284 case MSR_IA32_FEATURE_CONTROL: 2313 case MSR_IA32_FEATURE_CONTROL:
2285 *pdata = 0; 2314 if (nested_vmx_allowed(vcpu)) {
2286 break; 2315 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2316 break;
2317 }
2318 return 0;
2287 case MSR_IA32_VMX_BASIC: 2319 case MSR_IA32_VMX_BASIC:
2288 /* 2320 /*
2289 * This MSR reports some information about VMX support. We 2321 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2346 nested_vmx_secondary_ctls_high); 2378 nested_vmx_secondary_ctls_high);
2347 break; 2379 break;
2348 case MSR_IA32_VMX_EPT_VPID_CAP: 2380 case MSR_IA32_VMX_EPT_VPID_CAP:
2349 /* Currently, no nested ept or nested vpid */ 2381 /* Currently, no nested vpid support */
2350 *pdata = 0; 2382 *pdata = nested_vmx_ept_caps;
2351 break; 2383 break;
2352 default: 2384 default:
2353 return 0; 2385 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2356 return 1; 2388 return 1;
2357} 2389}
2358 2390
2359static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2391static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2360{ 2392{
2393 u32 msr_index = msr_info->index;
2394 u64 data = msr_info->data;
2395 bool host_initialized = msr_info->host_initiated;
2396
2361 if (!nested_vmx_allowed(vcpu)) 2397 if (!nested_vmx_allowed(vcpu))
2362 return 0; 2398 return 0;
2363 2399
2364 if (msr_index == MSR_IA32_FEATURE_CONTROL) 2400 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2365 /* TODO: the right thing. */ 2401 if (!host_initialized &&
2402 to_vmx(vcpu)->nested.msr_ia32_feature_control
2403 & FEATURE_CONTROL_LOCKED)
2404 return 0;
2405 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2366 return 1; 2406 return 1;
2407 }
2408
2367 /* 2409 /*
2368 * No need to treat VMX capability MSRs specially: If we don't handle 2410 * No need to treat VMX capability MSRs specially: If we don't handle
2369 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) 2411 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2494 return 1; 2536 return 1;
2495 /* Otherwise falls through */ 2537 /* Otherwise falls through */
2496 default: 2538 default:
2497 if (vmx_set_vmx_msr(vcpu, msr_index, data)) 2539 if (vmx_set_vmx_msr(vcpu, msr_info))
2498 break; 2540 break;
2499 msr = find_msr_entry(vmx, msr_index); 2541 msr = find_msr_entry(vmx, msr_index);
2500 if (msr) { 2542 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
5302 5344
5303 /* It is a write fault? */ 5345 /* It is a write fault? */
5304 error_code = exit_qualification & (1U << 1); 5346 error_code = exit_qualification & (1U << 1);
5347 /* It is a fetch fault? */
5348 error_code |= (exit_qualification & (1U << 2)) << 2;
5305 /* ept page table is present? */ 5349 /* ept page table is present? */
5306 error_code |= (exit_qualification >> 3) & 0x1; 5350 error_code |= (exit_qualification >> 3) & 0x1;
5307 5351
5352 vcpu->arch.exit_qualification = exit_qualification;
5353
5308 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5354 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5309} 5355}
5310 5356
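handle_ept_violation() now also reports instruction fetches, and the bit shuffling from exit qualification to page-fault error code is easy to lose track of. Below is a small sketch of just that arithmetic; the PFERR_* bit positions (present=0, write=1, fetch=4) are stated as assumptions.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* EPT violation exit qualification: bit 0 read, bit 1 write,
	 * bit 2 fetch, bit 3 "the EPT entry allowed the access". */
	uint64_t exit_qualification = (1 << 2) | (1 << 3);   /* fetch, present */
	uint32_t error_code;

	error_code  = exit_qualification & (1U << 1);        /* write   -> PFERR bit 1 */
	error_code |= (exit_qualification & (1U << 2)) << 2; /* fetch   -> PFERR bit 4 */
	error_code |= (exit_qualification >> 3) & 0x1;       /* present -> PFERR bit 0 */

	printf("error_code = %#x\n", error_code);  /* 0x11: fetch fault on present page */
	return 0;
}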
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5438 5484
5439 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5485 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5440 5486
5441 if (err == EMULATE_DO_MMIO) { 5487 if (err == EMULATE_USER_EXIT) {
5488 ++vcpu->stat.mmio_exits;
5442 ret = 0; 5489 ret = 0;
5443 goto out; 5490 goto out;
5444 } 5491 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5567 free_loaded_vmcs(&vmx->vmcs01); 5614 free_loaded_vmcs(&vmx->vmcs01);
5568} 5615}
5569 5616
5617/*
5618 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5619 * set the success or error code of an emulated VMX instruction, as specified
5620 * by Vol 2B, VMX Instruction Reference, "Conventions".
5621 */
5622static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5623{
5624 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5625 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5626 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5627}
5628
5629static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5630{
5631 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5632 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5633 X86_EFLAGS_SF | X86_EFLAGS_OF))
5634 | X86_EFLAGS_CF);
5635}
5636
5570static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5637static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5571 u32 vm_instruction_error); 5638 u32 vm_instruction_error)
5639{
5640 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5641 /*
5642 * failValid writes the error number to the current VMCS, which
 5643 * can't be done if there isn't a current VMCS.
5644 */
5645 nested_vmx_failInvalid(vcpu);
5646 return;
5647 }
5648 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5649 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5650 X86_EFLAGS_SF | X86_EFLAGS_OF))
5651 | X86_EFLAGS_ZF);
5652 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5653 /*
5654 * We don't need to force a shadow sync because
5655 * VM_INSTRUCTION_ERROR is not shadowed
5656 */
5657}
5572 5658
5573/* 5659/*
5574 * Emulate the VMXON instruction. 5660 * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5583 struct kvm_segment cs; 5669 struct kvm_segment cs;
5584 struct vcpu_vmx *vmx = to_vmx(vcpu); 5670 struct vcpu_vmx *vmx = to_vmx(vcpu);
5585 struct vmcs *shadow_vmcs; 5671 struct vmcs *shadow_vmcs;
5672 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5673 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5586 5674
5587 /* The Intel VMX Instruction Reference lists a bunch of bits that 5675 /* The Intel VMX Instruction Reference lists a bunch of bits that
5588 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5676 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5611 skip_emulated_instruction(vcpu); 5699 skip_emulated_instruction(vcpu);
5612 return 1; 5700 return 1;
5613 } 5701 }
5702
5703 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5704 != VMXON_NEEDED_FEATURES) {
5705 kvm_inject_gp(vcpu, 0);
5706 return 1;
5707 }
5708
5614 if (enable_shadow_vmcs) { 5709 if (enable_shadow_vmcs) {
5615 shadow_vmcs = alloc_vmcs(); 5710 shadow_vmcs = alloc_vmcs();
5616 if (!shadow_vmcs) 5711 if (!shadow_vmcs)
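Two pieces cooperate here: vmx_set_vmx_msr() drops guest writes to IA32_FEATURE_CONTROL once the lock bit is set (host-initiated writes still land), and handle_vmon() refuses VMXON unless both the lock bit and the enable-outside-SMX bit are set. A stripped-down sketch of that state machine, using the conventional bit positions (lock = bit 0, enable-outside-SMX = bit 2) as assumptions; it models only the shadow value, not the injected #GP.

#include <stdio.h>
#include <stdint.h>

#define FEATURE_CONTROL_LOCKED                    (1ULL << 0)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1ULL << 2)

static uint64_t feature_control;   /* per-vCPU shadow, as in struct nested_vmx */

/* Mirrors the MSR write path: guest writes are ignored once the lock bit is
 * set, host (userspace-initiated) writes always land. */
static void wrmsr_feature_control(uint64_t data, int host_initiated)
{
	if (!host_initiated && (feature_control & FEATURE_CONTROL_LOCKED))
		return;
	feature_control = data;
}

/* Mirrors the new check in handle_vmon(): VMXON needs lock + enable bits. */
static int vmxon_allowed(void)
{
	const uint64_t needed = FEATURE_CONTROL_LOCKED |
				FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	return (feature_control & needed) == needed;
}

int main(void)
{
	printf("fresh vcpu: vmxon allowed? %d\n", vmxon_allowed());
	wrmsr_feature_control(FEATURE_CONTROL_LOCKED |
			      FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX, 0);
	printf("after guest enables+locks: vmxon allowed? %d\n", vmxon_allowed());
	wrmsr_feature_control(0, 0);   /* locked: this guest write is dropped */
	printf("after locked guest write:  vmxon allowed? %d\n", vmxon_allowed());
	return 0;
}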
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5628 vmx->nested.vmxon = true; 5723 vmx->nested.vmxon = true;
5629 5724
5630 skip_emulated_instruction(vcpu); 5725 skip_emulated_instruction(vcpu);
5726 nested_vmx_succeed(vcpu);
5631 return 1; 5727 return 1;
5632} 5728}
5633 5729
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5712 return 1; 5808 return 1;
5713 free_nested(to_vmx(vcpu)); 5809 free_nested(to_vmx(vcpu));
5714 skip_emulated_instruction(vcpu); 5810 skip_emulated_instruction(vcpu);
5811 nested_vmx_succeed(vcpu);
5715 return 1; 5812 return 1;
5716} 5813}
5717 5814
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5768 return 0; 5865 return 0;
5769} 5866}
5770 5867
5771/*
5772 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5773 * set the success or error code of an emulated VMX instruction, as specified
5774 * by Vol 2B, VMX Instruction Reference, "Conventions".
5775 */
5776static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5777{
5778 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5779 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5780 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5781}
5782
5783static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5784{
5785 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5786 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5787 X86_EFLAGS_SF | X86_EFLAGS_OF))
5788 | X86_EFLAGS_CF);
5789}
5790
5791static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5792 u32 vm_instruction_error)
5793{
5794 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5795 /*
5796 * failValid writes the error number to the current VMCS, which
5797 * can't be done there isn't a current VMCS.
5798 */
5799 nested_vmx_failInvalid(vcpu);
5800 return;
5801 }
5802 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5803 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5804 X86_EFLAGS_SF | X86_EFLAGS_OF))
5805 | X86_EFLAGS_ZF);
5806 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5807 /*
5808 * We don't need to force a shadow sync because
5809 * VM_INSTRUCTION_ERROR is not shadowed
5810 */
5811}
5812
5813/* Emulate the VMCLEAR instruction */ 5868/* Emulate the VMCLEAR instruction */
5814static int handle_vmclear(struct kvm_vcpu *vcpu) 5869static int handle_vmclear(struct kvm_vcpu *vcpu)
5815{ 5870{
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5972 unsigned long field; 6027 unsigned long field;
5973 u64 field_value; 6028 u64 field_value;
5974 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6029 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5975 unsigned long *fields = (unsigned long *)shadow_read_write_fields; 6030 const unsigned long *fields = shadow_read_write_fields;
5976 int num_fields = max_shadow_read_write_fields; 6031 const int num_fields = max_shadow_read_write_fields;
5977 6032
5978 vmcs_load(shadow_vmcs); 6033 vmcs_load(shadow_vmcs);
5979 6034
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6002 6057
6003static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6058static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6004{ 6059{
6005 unsigned long *fields[] = { 6060 const unsigned long *fields[] = {
6006 (unsigned long *)shadow_read_write_fields, 6061 shadow_read_write_fields,
6007 (unsigned long *)shadow_read_only_fields 6062 shadow_read_only_fields
6008 }; 6063 };
6009 int num_lists = ARRAY_SIZE(fields); 6064 const int max_fields[] = {
6010 int max_fields[] = {
6011 max_shadow_read_write_fields, 6065 max_shadow_read_write_fields,
6012 max_shadow_read_only_fields 6066 max_shadow_read_only_fields
6013 }; 6067 };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6018 6072
6019 vmcs_load(shadow_vmcs); 6073 vmcs_load(shadow_vmcs);
6020 6074
6021 for (q = 0; q < num_lists; q++) { 6075 for (q = 0; q < ARRAY_SIZE(fields); q++) {
6022 for (i = 0; i < max_fields[q]; i++) { 6076 for (i = 0; i < max_fields[q]; i++) {
6023 field = fields[q][i]; 6077 field = fields[q][i];
6024 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6078 vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6248 return 1; 6302 return 1;
6249} 6303}
6250 6304
6305/* Emulate the INVEPT instruction */
6306static int handle_invept(struct kvm_vcpu *vcpu)
6307{
6308 u32 vmx_instruction_info, types;
6309 unsigned long type;
6310 gva_t gva;
6311 struct x86_exception e;
6312 struct {
6313 u64 eptp, gpa;
6314 } operand;
6315 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6316
6317 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6318 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6319 kvm_queue_exception(vcpu, UD_VECTOR);
6320 return 1;
6321 }
6322
6323 if (!nested_vmx_check_permission(vcpu))
6324 return 1;
6325
6326 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6327 kvm_queue_exception(vcpu, UD_VECTOR);
6328 return 1;
6329 }
6330
6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6332 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6333
6334 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6335
6336 if (!(types & (1UL << type))) {
6337 nested_vmx_failValid(vcpu,
6338 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6339 return 1;
6340 }
6341
6342 /* According to the Intel VMX instruction reference, the memory
6343 * operand is read even if it isn't needed (e.g., for type==global)
6344 */
6345 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6346 vmx_instruction_info, &gva))
6347 return 1;
6348 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6349 sizeof(operand), &e)) {
6350 kvm_inject_page_fault(vcpu, &e);
6351 return 1;
6352 }
6353
6354 switch (type) {
6355 case VMX_EPT_EXTENT_CONTEXT:
6356 if ((operand.eptp & eptp_mask) !=
6357 (nested_ept_get_cr3(vcpu) & eptp_mask))
6358 break;
6359 case VMX_EPT_EXTENT_GLOBAL:
6360 kvm_mmu_sync_roots(vcpu);
6361 kvm_mmu_flush_tlb(vcpu);
6362 nested_vmx_succeed(vcpu);
6363 break;
6364 default:
6365 BUG_ON(1);
6366 break;
6367 }
6368
6369 skip_emulated_instruction(vcpu);
6370 return 1;
6371}
6372
6251/* 6373/*
6252 * The exit handlers return 1 if the exit was handled fully and guest execution 6374 * The exit handlers return 1 if the exit was handled fully and guest execution
6253 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6375 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
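The type check in handle_invept() is pure bit arithmetic: the advertised extent capabilities sit at bits 25 (context) and 26 (global) of the EPT/VPID capability MSR, so shifting by VMX_EPT_EXTENT_SHIFT (24) lines bit N up with extent type N, and masking with 6 keeps only types 1 and 2. A worked sketch follows; the shift and bit values are taken to be the usual asm/vmx.h ones and are stated here as assumptions.

#include <stdio.h>
#include <stdint.h>

#define VMX_EPT_EXTENT_CONTEXT      1
#define VMX_EPT_EXTENT_GLOBAL       2
#define VMX_EPT_EXTENT_SHIFT        24
#define VMX_EPT_EXTENT_CONTEXT_BIT  (1ULL << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT   (1ULL << 26)

int main(void)
{
	/* What nested_vmx_setup_ctls_msrs() advertises when EPT is enabled. */
	uint64_t ept_caps = VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT;

	/* handle_invept(): after the shift, bit N of "types" says whether
	 * extent type N is supported; masking with 6 keeps types 1 and 2. */
	uint32_t types = (ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	for (unsigned long type = 0; type < 4; type++)
		printf("invept type %lu: %s\n", type,
		       (types & (1UL << type)) ? "ok" : "VMfailValid");
	return 0;
}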
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6292 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6414 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6293 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6415 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
6294 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6416 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6417 [EXIT_REASON_INVEPT] = handle_invept,
6295}; 6418};
6296 6419
6297static const int kvm_vmx_max_exit_handlers = 6420static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6518 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 6641 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
6519 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 6642 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
6520 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6643 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6644 case EXIT_REASON_INVEPT:
6521 /* 6645 /*
6522 * VMX instructions trap unconditionally. This allows L1 to 6646 * VMX instructions trap unconditionally. This allows L1 to
6523 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6647 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6550 return nested_cpu_has2(vmcs12, 6674 return nested_cpu_has2(vmcs12,
6551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 6675 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
6552 case EXIT_REASON_EPT_VIOLATION: 6676 case EXIT_REASON_EPT_VIOLATION:
6677 /*
6678 * L0 always deals with the EPT violation. If nested EPT is
6679 * used, and the nested mmu code discovers that the address is
6680 * missing in the guest EPT table (EPT12), the EPT violation
6681 * will be injected with nested_ept_inject_page_fault()
6682 */
6683 return 0;
6553 case EXIT_REASON_EPT_MISCONFIG: 6684 case EXIT_REASON_EPT_MISCONFIG:
6685 /*
 6686 * L2 never uses L1's EPT directly, but rather L0's own EPT
6687 * table (shadow on EPT) or a merged EPT table that L0 built
6688 * (EPT on EPT). So any problems with the structure of the
 6689 * table are L0's fault.
6690 */
6554 return 0; 6691 return 0;
6555 case EXIT_REASON_PREEMPTION_TIMER: 6692 case EXIT_REASON_PREEMPTION_TIMER:
6556 return vmcs12->pin_based_vm_exec_control & 6693 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6638 6775
6639 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6776 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6640 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6777 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6641 get_vmcs12(vcpu), vcpu)))) { 6778 get_vmcs12(vcpu))))) {
6642 if (vmx_interrupt_allowed(vcpu)) { 6779 if (vmx_interrupt_allowed(vcpu)) {
6643 vmx->soft_vnmi_blocked = 0; 6780 vmx->soft_vnmi_blocked = 0;
6644 } else if (vmx->vnmi_blocked_time > 1000000000LL && 6781 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7326 entry->ecx |= bit(X86_FEATURE_VMX); 7463 entry->ecx |= bit(X86_FEATURE_VMX);
7327} 7464}
7328 7465
7466static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7467 struct x86_exception *fault)
7468{
7469 struct vmcs12 *vmcs12;
7470 nested_vmx_vmexit(vcpu);
7471 vmcs12 = get_vmcs12(vcpu);
7472
7473 if (fault->error_code & PFERR_RSVD_MASK)
7474 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
7475 else
7476 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
7477 vmcs12->exit_qualification = vcpu->arch.exit_qualification;
7478 vmcs12->guest_physical_address = fault->address;
7479}
7480
7481/* Callbacks for nested_ept_init_mmu_context: */
7482
7483static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7484{
7485 /* return the page table to be shadowed - in our case, EPT12 */
7486 return get_vmcs12(vcpu)->ept_pointer;
7487}
7488
7489static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7490{
7491 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7492 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7493
7494 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
7495 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
7496 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7497
7498 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7499
7500 return r;
7501}
7502
7503static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7504{
7505 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7506}
7507
7329/* 7508/*
7330 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7509 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7331 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7510 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7388 vmcs12->guest_interruptibility_info); 7567 vmcs12->guest_interruptibility_info);
7389 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7568 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
7390 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7569 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
7391 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7570 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
7392 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
7393 vmcs12->guest_pending_dbg_exceptions); 7572 vmcs12->guest_pending_dbg_exceptions);
7394 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 7573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7508 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 7687 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
7509 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 7688 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7510 7689
7511 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ 7690 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
7512 vmcs_write32(VM_EXIT_CONTROLS, 7691 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7513 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); 7692 * bits are further modified by vmx_set_efer() below.
7514 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | 7693 */
7694 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
7695
7696 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7697 * emulated by vmx_set_efer(), below.
7698 */
7699 vmcs_write32(VM_ENTRY_CONTROLS,
7700 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7701 ~VM_ENTRY_IA32E_MODE) |
7515 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7702 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
7516 7703
7517 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) 7704 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
7518 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 7705 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
7519 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 7706 vcpu->arch.pat = vmcs12->guest_ia32_pat;
7707 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
7520 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 7708 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
7521 7709
7522 7710
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7538 vmx_flush_tlb(vcpu); 7726 vmx_flush_tlb(vcpu);
7539 } 7727 }
7540 7728
7729 if (nested_cpu_has_ept(vmcs12)) {
7730 kvm_mmu_unload(vcpu);
7731 nested_ept_init_mmu_context(vcpu);
7732 }
7733
7541 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7734 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7542 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7735 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7543 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7736 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7565 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7758 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7566 kvm_mmu_reset_context(vcpu); 7759 kvm_mmu_reset_context(vcpu);
7567 7760
7761 /*
 7762 * L1 may access L2's PDPTRs, so save them to construct vmcs12
7763 */
7764 if (enable_ept) {
7765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
7766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
7767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
7768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
7769 }
7770
7568 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 7771 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
7569 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 7772 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
7570} 7773}
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7887 vmcs12->guest_pending_dbg_exceptions = 8090 vmcs12->guest_pending_dbg_exceptions =
7888 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8091 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7889 8092
8093 /*
8094 * In some cases (usually, nested EPT), L2 is allowed to change its
8095 * own CR3 without exiting. If it has changed it, we must keep it.
8096 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
8097 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
8098 *
8099 * Additionally, restore L2's PDPTR to vmcs12.
8100 */
8101 if (enable_ept) {
8102 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
8103 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
8104 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
8105 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
8106 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8107 }
8108
7890 vmcs12->vm_entry_controls = 8109 vmcs12->vm_entry_controls =
7891 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8110 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7892 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8111 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7948static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8167static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7949 struct vmcs12 *vmcs12) 8168 struct vmcs12 *vmcs12)
7950{ 8169{
8170 struct kvm_segment seg;
8171
7951 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8172 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7952 vcpu->arch.efer = vmcs12->host_ia32_efer; 8173 vcpu->arch.efer = vmcs12->host_ia32_efer;
7953 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8174 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8203 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7983 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8204 kvm_set_cr4(vcpu, vmcs12->host_cr4);
7984 8205
7985 /* shadow page tables on either EPT or shadow page tables */ 8206 if (nested_cpu_has_ept(vmcs12))
8207 nested_ept_uninit_mmu_context(vcpu);
8208
7986 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8209 kvm_set_cr3(vcpu, vmcs12->host_cr3);
7987 kvm_mmu_reset_context(vcpu); 8210 kvm_mmu_reset_context(vcpu);
7988 8211
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8001 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8224 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
8002 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8225 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
8003 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8226 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
8004 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); 8227
8005 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); 8228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
8006 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
8007 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
8008 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
8009 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
8010 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
8011 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
8012 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
8013 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
8014
8015 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
8016 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8229 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
8230 vcpu->arch.pat = vmcs12->host_ia32_pat;
8231 }
8017 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8232 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8018 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8233 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
8019 vmcs12->host_ia32_perf_global_ctrl); 8234 vmcs12->host_ia32_perf_global_ctrl);
8020 8235
8236 /* Set L1 segment info according to Intel SDM
8237 27.5.2 Loading Host Segment and Descriptor-Table Registers */
8238 seg = (struct kvm_segment) {
8239 .base = 0,
8240 .limit = 0xFFFFFFFF,
8241 .selector = vmcs12->host_cs_selector,
8242 .type = 11,
8243 .present = 1,
8244 .s = 1,
8245 .g = 1
8246 };
8247 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
8248 seg.l = 1;
8249 else
8250 seg.db = 1;
8251 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
8252 seg = (struct kvm_segment) {
8253 .base = 0,
8254 .limit = 0xFFFFFFFF,
8255 .type = 3,
8256 .present = 1,
8257 .s = 1,
8258 .db = 1,
8259 .g = 1
8260 };
8261 seg.selector = vmcs12->host_ds_selector;
8262 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
8263 seg.selector = vmcs12->host_es_selector;
8264 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
8265 seg.selector = vmcs12->host_ss_selector;
8266 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
8267 seg.selector = vmcs12->host_fs_selector;
8268 seg.base = vmcs12->host_fs_base;
8269 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
8270 seg.selector = vmcs12->host_gs_selector;
8271 seg.base = vmcs12->host_gs_base;
8272 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
8273 seg = (struct kvm_segment) {
8274 .base = vmcs12->host_tr_base,
8275 .limit = 0x67,
8276 .selector = vmcs12->host_tr_selector,
8277 .type = 11,
8278 .present = 1
8279 };
8280 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
8281
8021 kvm_set_dr(vcpu, 7, 0x400); 8282 kvm_set_dr(vcpu, 7, 0x400);
8022 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8283 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
8023} 8284}
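The moved nested_vmx_succeed()/failInvalid()/failValid() helpers encode the SDM's VMX instruction "Conventions": VMsucceed clears all six arithmetic flags, VMfailInvalid sets only CF, and VMfailValid sets only ZF and additionally stores the error number in VM_INSTRUCTION_ERROR (omitted below). A sketch of just the RFLAGS part; the EFLAGS bit values are written out here as assumptions, and the helpers are illustrative, not the kernel functions.

#include <stdio.h>

#define X86_EFLAGS_CF 0x0001
#define X86_EFLAGS_PF 0x0004
#define X86_EFLAGS_AF 0x0010
#define X86_EFLAGS_ZF 0x0040
#define X86_EFLAGS_SF 0x0080
#define X86_EFLAGS_OF 0x0800

#define VMX_ARITH (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
		   X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static unsigned long vmx_succeed(unsigned long rflags)
{
	return rflags & ~VMX_ARITH;                    /* VMsucceed: all six clear */
}

static unsigned long vmx_fail_invalid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH) | X86_EFLAGS_CF;  /* VMfailInvalid: only CF set */
}

static unsigned long vmx_fail_valid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH) | X86_EFLAGS_ZF;  /* VMfailValid: only ZF set */
}

int main(void)
{
	unsigned long rflags = 0x246;   /* a typical RFLAGS value */

	printf("succeed:      %#lx\n", vmx_succeed(rflags));
	printf("fail invalid: %#lx\n", vmx_fail_invalid(rflags));
	printf("fail valid:   %#lx\n", vmx_fail_valid(rflags));
	return 0;
}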
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
682 */ 682 */
683 } 683 }
684 684
685 /*
686 * Does the new cr3 value map to physical memory? (Note, we
687 * catch an invalid cr3 even in real-mode, because it would
688 * cause trouble later on when we turn on paging anyway.)
689 *
690 * A real CPU would silently accept an invalid cr3 and would
691 * attempt to use it - with largely undefined (and often hard
692 * to debug) behavior on the guest side.
693 */
694 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
695 return 1;
696 vcpu->arch.cr3 = cr3; 685 vcpu->arch.cr3 = cr3;
697 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
698 vcpu->arch.mmu.new_cr3(vcpu); 687 vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
850#ifdef CONFIG_X86_64 839#ifdef CONFIG_X86_64
851 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 840 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
852#endif 841#endif
853 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 842 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
843 MSR_IA32_FEATURE_CONTROL
854}; 844};
855 845
856static unsigned num_msrs_to_save; 846static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1457#endif 1447#endif
1458} 1448}
1459 1449
1450static void kvm_gen_update_masterclock(struct kvm *kvm)
1451{
1452#ifdef CONFIG_X86_64
1453 int i;
1454 struct kvm_vcpu *vcpu;
1455 struct kvm_arch *ka = &kvm->arch;
1456
1457 spin_lock(&ka->pvclock_gtod_sync_lock);
1458 kvm_make_mclock_inprogress_request(kvm);
1459 /* no guest entries from this point */
1460 pvclock_update_vm_gtod_copy(kvm);
1461
1462 kvm_for_each_vcpu(i, vcpu, kvm)
1463 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1464
1465 /* guest entries allowed */
1466 kvm_for_each_vcpu(i, vcpu, kvm)
1467 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1468
1469 spin_unlock(&ka->pvclock_gtod_sync_lock);
1470#endif
1471}
1472
1460static int kvm_guest_time_update(struct kvm_vcpu *v) 1473static int kvm_guest_time_update(struct kvm_vcpu *v)
1461{ 1474{
1462 unsigned long flags, this_tsc_khz; 1475 unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3806 delta = user_ns.clock - now_ns; 3819 delta = user_ns.clock - now_ns;
3807 local_irq_enable(); 3820 local_irq_enable();
3808 kvm->arch.kvmclock_offset = delta; 3821 kvm->arch.kvmclock_offset = delta;
3822 kvm_gen_update_masterclock(kvm);
3809 break; 3823 break;
3810 } 3824 }
3811 case KVM_GET_CLOCK: { 3825 case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4955static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4969static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4956static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4970static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4957 4971
4972static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
4973 unsigned long *db)
4974{
4975 u32 dr6 = 0;
4976 int i;
4977 u32 enable, rwlen;
4978
4979 enable = dr7;
4980 rwlen = dr7 >> 16;
4981 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
4982 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
4983 dr6 |= (1 << i);
4984 return dr6;
4985}
4986
4987static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
4988{
4989 struct kvm_run *kvm_run = vcpu->run;
4990
4991 /*
4992 * Use the "raw" value to see if TF was passed to the processor.
4993 * Note that the new value of the flags has not been saved yet.
4994 *
4995 * This is correct even for TF set by the guest, because "the
4996 * processor will not generate this exception after the instruction
4997 * that sets the TF flag".
4998 */
4999 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5000
5001 if (unlikely(rflags & X86_EFLAGS_TF)) {
5002 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5003 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5004 kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5005 kvm_run->debug.arch.exception = DB_VECTOR;
5006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5007 *r = EMULATE_USER_EXIT;
5008 } else {
5009 vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5010 /*
5011 * "Certain debug exceptions may clear bit 0-3. The
5012 * remaining contents of the DR6 register are never
5013 * cleared by the processor".
5014 */
5015 vcpu->arch.dr6 &= ~15;
5016 vcpu->arch.dr6 |= DR6_BS;
5017 kvm_queue_exception(vcpu, DB_VECTOR);
5018 }
5019 }
5020}
5021
5022static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5023{
5024 struct kvm_run *kvm_run = vcpu->run;
5025 unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5026 u32 dr6 = 0;
5027
5028 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5029 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5030 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5031 vcpu->arch.guest_debug_dr7,
5032 vcpu->arch.eff_db);
5033
5034 if (dr6 != 0) {
5035 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5036 kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5037 get_segment_base(vcpu, VCPU_SREG_CS);
5038
5039 kvm_run->debug.arch.exception = DB_VECTOR;
5040 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5041 *r = EMULATE_USER_EXIT;
5042 return true;
5043 }
5044 }
5045
5046 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5047 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5048 vcpu->arch.dr7,
5049 vcpu->arch.db);
5050
5051 if (dr6 != 0) {
5052 vcpu->arch.dr6 &= ~15;
5053 vcpu->arch.dr6 |= dr6;
5054 kvm_queue_exception(vcpu, DB_VECTOR);
5055 *r = EMULATE_DONE;
5056 return true;
5057 }
5058 }
5059
5060 return false;
5061}
5062
4958int x86_emulate_instruction(struct kvm_vcpu *vcpu, 5063int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4959 unsigned long cr2, 5064 unsigned long cr2,
4960 int emulation_type, 5065 int emulation_type,
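[Editor's sketch] kvm_vcpu_check_hw_bp() in the hunk above packs the DR7 decode into one loop. The standalone restatement below shows the same bit layout (local/global enable pairs in bits 0-7, one 4-bit R/W-type-plus-length field per breakpoint from bit 16 up); hw_bp_matches() is an illustrative name, not part of the patch.

/* A breakpoint i matches when it is enabled (Li or Gi set), its R/W+LEN
 * field equals the requested type, and DBi holds the address; the result is
 * the set of DR6.Bi bits to report, exactly as in kvm_vcpu_check_hw_bp(). */
static unsigned int hw_bp_matches(unsigned long dr7, unsigned long addr,
				  unsigned int type, const unsigned long db[4])
{
	unsigned int hit = 0;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int enable = (dr7 >> (i * 2)) & 3;	  /* Li | Gi */
		unsigned int rwlen  = (dr7 >> (16 + i * 4)) & 15; /* R/W+LEN */

		if (enable && rwlen == type && db[i] == addr)
			hit |= 1u << i;		/* becomes DR6.Bi */
	}
	return hit;
}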
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4975 5080
4976 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 5081 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4977 init_emulate_ctxt(vcpu); 5082 init_emulate_ctxt(vcpu);
5083
5084 /*
5085 * We will reenter on the same instruction since
5086 * we do not set complete_userspace_io. This does not
5087 * handle watchpoints yet, those would be handled in
5088 * the emulate_ops.
5089 */
5090 if (kvm_vcpu_check_breakpoint(vcpu, &r))
5091 return r;
5092
4978 ctxt->interruptibility = 0; 5093 ctxt->interruptibility = 0;
4979 ctxt->have_exception = false; 5094 ctxt->have_exception = false;
4980 ctxt->perm_ok = false; 5095 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
5031 inject_emulated_exception(vcpu); 5146 inject_emulated_exception(vcpu);
5032 r = EMULATE_DONE; 5147 r = EMULATE_DONE;
5033 } else if (vcpu->arch.pio.count) { 5148 } else if (vcpu->arch.pio.count) {
5034 if (!vcpu->arch.pio.in) 5149 if (!vcpu->arch.pio.in) {
5150 /* FIXME: return into emulator if single-stepping. */
5035 vcpu->arch.pio.count = 0; 5151 vcpu->arch.pio.count = 0;
5036 else { 5152 } else {
5037 writeback = false; 5153 writeback = false;
5038 vcpu->arch.complete_userspace_io = complete_emulated_pio; 5154 vcpu->arch.complete_userspace_io = complete_emulated_pio;
5039 } 5155 }
5040 r = EMULATE_DO_MMIO; 5156 r = EMULATE_USER_EXIT;
5041 } else if (vcpu->mmio_needed) { 5157 } else if (vcpu->mmio_needed) {
5042 if (!vcpu->mmio_is_write) 5158 if (!vcpu->mmio_is_write)
5043 writeback = false; 5159 writeback = false;
5044 r = EMULATE_DO_MMIO; 5160 r = EMULATE_USER_EXIT;
5045 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5161 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5046 } else if (r == EMULATION_RESTART) 5162 } else if (r == EMULATION_RESTART)
5047 goto restart; 5163 goto restart;
@@ -5050,10 +5166,12 @@ restart:
5050 5166
5051 if (writeback) { 5167 if (writeback) {
5052 toggle_interruptibility(vcpu, ctxt->interruptibility); 5168 toggle_interruptibility(vcpu, ctxt->interruptibility);
5053 kvm_set_rflags(vcpu, ctxt->eflags);
5054 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 kvm_make_request(KVM_REQ_EVENT, vcpu);
5055 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5170 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5056 kvm_rip_write(vcpu, ctxt->eip); 5171 kvm_rip_write(vcpu, ctxt->eip);
5172 if (r == EMULATE_DONE)
5173 kvm_vcpu_check_singlestep(vcpu, &r);
5174 kvm_set_rflags(vcpu, ctxt->eflags);
5057 } else 5175 } else
5058 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 5176 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5059 5177
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
5347int kvm_arch_init(void *opaque) 5465int kvm_arch_init(void *opaque)
5348{ 5466{
5349 int r; 5467 int r;
5350 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5468 struct kvm_x86_ops *ops = opaque;
5351 5469
5352 if (kvm_x86_ops) { 5470 if (kvm_x86_ops) {
5353 printk(KERN_ERR "kvm: already loaded the other module\n"); 5471 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5495 return 1; 5613 return 1;
5496} 5614}
5497 5615
5616/*
5617 * kvm_pv_kick_cpu_op: Kick a vcpu.
5618 *
5619 * @apicid - apicid of vcpu to be kicked.
5620 */
5621static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5622{
5623 struct kvm_lapic_irq lapic_irq;
5624
5625 lapic_irq.shorthand = 0;
5626 lapic_irq.dest_mode = 0;
5627 lapic_irq.dest_id = apicid;
5628
5629 lapic_irq.delivery_mode = APIC_DM_REMRD;
5630 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5631}
5632
5498int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5633int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5499{ 5634{
5500 unsigned long nr, a0, a1, a2, a3, ret; 5635 unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5528 case KVM_HC_VAPIC_POLL_IRQ: 5663 case KVM_HC_VAPIC_POLL_IRQ:
5529 ret = 0; 5664 ret = 0;
5530 break; 5665 break;
5666 case KVM_HC_KICK_CPU:
5667 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5668 ret = 0;
5669 break;
5531 default: 5670 default:
5532 ret = -KVM_ENOSYS; 5671 ret = -KVM_ENOSYS;
5533 break; 5672 break;
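[Editor's sketch] The new KVM_HC_KICK_CPU case above is the host side; a guest would issue the hypercall roughly as below. kvm_hypercall2() from asm/kvm_para.h and an exported KVM_HC_KICK_CPU constant are assumed here, and the flags argument is currently unused by the handler.

/* Hedged guest-side sketch: wake a halted vCPU by APIC ID. */
#include <linux/kvm_para.h>

static void kvm_kick_cpu(int apicid)
{
	unsigned long flags = 0;	/* reserved; the host ignores it above */

	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}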
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5689 kvm_make_request(KVM_REQ_EVENT, vcpu); 5828 kvm_make_request(KVM_REQ_EVENT, vcpu);
5690} 5829}
5691 5830
5692static void kvm_gen_update_masterclock(struct kvm *kvm)
5693{
5694#ifdef CONFIG_X86_64
5695 int i;
5696 struct kvm_vcpu *vcpu;
5697 struct kvm_arch *ka = &kvm->arch;
5698
5699 spin_lock(&ka->pvclock_gtod_sync_lock);
5700 kvm_make_mclock_inprogress_request(kvm);
5701 /* no guest entries from this point */
5702 pvclock_update_vm_gtod_copy(kvm);
5703
5704 kvm_for_each_vcpu(i, vcpu, kvm)
5705 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5706
5707 /* guest entries allowed */
5708 kvm_for_each_vcpu(i, vcpu, kvm)
5709 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5710
5711 spin_unlock(&ka->pvclock_gtod_sync_lock);
5712#endif
5713}
5714
5715static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5831static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5716{ 5832{
5717 u64 eoi_exit_bitmap[4]; 5833 u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5950 kvm_apic_accept_events(vcpu); 6066 kvm_apic_accept_events(vcpu);
5951 switch(vcpu->arch.mp_state) { 6067 switch(vcpu->arch.mp_state) {
5952 case KVM_MP_STATE_HALTED: 6068 case KVM_MP_STATE_HALTED:
6069 vcpu->arch.pv.pv_unhalted = false;
5953 vcpu->arch.mp_state = 6070 vcpu->arch.mp_state =
5954 KVM_MP_STATE_RUNNABLE; 6071 KVM_MP_STATE_RUNNABLE;
5955 case KVM_MP_STATE_RUNNABLE: 6072 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6061 6178
6062 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6179 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
6063 vcpu->mmio_needed = 0; 6180 vcpu->mmio_needed = 0;
6181
6182 /* FIXME: return into emulator if single-stepping. */
6064 if (vcpu->mmio_is_write) 6183 if (vcpu->mmio_is_write)
6065 return 1; 6184 return 1;
6066 vcpu->mmio_read_completed = 1; 6185 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6249 struct kvm_mp_state *mp_state) 6368 struct kvm_mp_state *mp_state)
6250{ 6369{
6251 kvm_apic_accept_events(vcpu); 6370 kvm_apic_accept_events(vcpu);
6252 mp_state->mp_state = vcpu->arch.mp_state; 6371 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6372 vcpu->arch.pv.pv_unhalted)
6373 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6374 else
6375 mp_state->mp_state = vcpu->arch.mp_state;
6376
6253 return 0; 6377 return 0;
6254} 6378}
6255 6379
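[Editor's sketch] With the hunk above, a halted vCPU that still has a paravirtual kick pending is reported as runnable, so saving and restoring mp_state across migration cannot lose the wakeup. A minimal userspace check of that ioctl (standard KVM API, error handling reduced to a return code):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vcpu_is_runnable(int vcpu_fd)
{
	struct kvm_mp_state st;

	if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &st) < 0)
		return -1;
	return st.mp_state == KVM_MP_STATE_RUNNABLE;
}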
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6770 BUG_ON(vcpu->kvm == NULL); 6894 BUG_ON(vcpu->kvm == NULL);
6771 kvm = vcpu->kvm; 6895 kvm = vcpu->kvm;
6772 6896
6897 vcpu->arch.pv.pv_unhalted = false;
6773 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6898 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6774 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6899 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6775 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6900 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
7019 return -ENOMEM; 7144 return -ENOMEM;
7020} 7145}
7021 7146
7147void kvm_arch_memslots_updated(struct kvm *kvm)
7148{
7149 /*
7150 * memslots->generation has been incremented.
7151 * mmio generation may have reached its maximum value.
7152 */
7153 kvm_mmu_invalidate_mmio_sptes(kvm);
7154}
7155
7022int kvm_arch_prepare_memory_region(struct kvm *kvm, 7156int kvm_arch_prepare_memory_region(struct kvm *kvm,
7023 struct kvm_memory_slot *memslot, 7157 struct kvm_memory_slot *memslot,
7024 struct kvm_userspace_memory_region *mem, 7158 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7079 */ 7213 */
7080 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7214 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7081 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7215 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7082 /*
7083 * If memory slot is created, or moved, we need to clear all
7084 * mmio sptes.
7085 */
7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7087} 7216}
7088 7217
7089void kvm_arch_flush_shadow_all(struct kvm *kvm) 7218void kvm_arch_flush_shadow_all(struct kvm *kvm)
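[Editor's sketch] The two hunks above move the mmio-spte invalidation from kvm_arch_commit_memory_region() to kvm_arch_memslots_updated(), i.e. to the point where memslots->generation has just been bumped. The underlying pattern is a generation-tagged cache entry: once the generation counter can wrap, stale entries could compare equal again, so everything must be zapped before that happens. The sketch below only illustrates that pattern; KVM's real mmio spte encoding differs.

#include <linux/types.h>

struct tagged_entry {
	u64 data;
	u64 generation;		/* copied from memslots->generation at fill time */
};

static bool entry_is_stale(const struct tagged_entry *e, u64 cur_generation)
{
	/* Valid only while the generation counter has not wrapped back to the
	 * value stored in the entry. */
	return e->generation != cur_generation;
}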
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7103 !vcpu->arch.apf.halted) 7232 !vcpu->arch.apf.halted)
7104 || !list_empty_careful(&vcpu->async_pf.done) 7233 || !list_empty_careful(&vcpu->async_pf.done)
7105 || kvm_apic_has_events(vcpu) 7234 || kvm_apic_has_events(vcpu)
7235 || vcpu->arch.pv.pv_unhalted
7106 || atomic_read(&vcpu->arch.nmi_queued) || 7236 || atomic_read(&vcpu->arch.nmi_queued) ||
7107 (kvm_arch_interrupt_allowed(vcpu) && 7237 (kvm_arch_interrupt_allowed(vcpu) &&
7108 kvm_cpu_has_interrupt(vcpu)); 7238 kvm_cpu_has_interrupt(vcpu));
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 25b7ae8d058a..7609e0e421ec 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
6 */ 6 */
7#include <asm/checksum.h> 7#include <asm/checksum.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <asm/smap.h>
9 10
10/** 11/**
11 * csum_partial_copy_from_user - Copy and checksum from user space. 12 * csum_partial_copy_from_user - Copy and checksum from user space.
@@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
52 len -= 2; 53 len -= 2;
53 } 54 }
54 } 55 }
56 stac();
55 isum = csum_partial_copy_generic((__force const void *)src, 57 isum = csum_partial_copy_generic((__force const void *)src,
56 dst, len, isum, errp, NULL); 58 dst, len, isum, errp, NULL);
59 clac();
57 if (unlikely(*errp)) 60 if (unlikely(*errp))
58 goto out_err; 61 goto out_err;
59 62
@@ -82,6 +85,8 @@ __wsum
82csum_partial_copy_to_user(const void *src, void __user *dst, 85csum_partial_copy_to_user(const void *src, void __user *dst,
83 int len, __wsum isum, int *errp) 86 int len, __wsum isum, int *errp)
84{ 87{
88 __wsum ret;
89
85 might_sleep(); 90 might_sleep();
86 91
87 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { 92 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
@@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
105 } 110 }
106 111
107 *errp = 0; 112 *errp = 0;
108 return csum_partial_copy_generic(src, (void __force *)dst, 113 stac();
109 len, isum, NULL, errp); 114 ret = csum_partial_copy_generic(src, (void __force *)dst,
115 len, isum, NULL, errp);
116 clac();
117 return ret;
110} 118}
111EXPORT_SYMBOL(csum_partial_copy_to_user); 119EXPORT_SYMBOL(csum_partial_copy_to_user);
112 120
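[Editor's sketch] The stac()/clac() additions above follow the usual SMAP rule: user pages are only accessible between stac() and clac(), so a routine that reaches user memory outside the uaccess helpers must bracket the access itself. A hedged restatement of that pattern, with copy_and_sum_user() as an invented wrapper name:

#include <linux/errno.h>
#include <linux/uaccess.h>
#include <asm/checksum.h>
#include <asm/smap.h>

static int copy_and_sum_user(void *dst, const void __user *src, int len,
			     __wsum *sum, int *err)
{
	if (!access_ok(VERIFY_READ, src, len))
		return -EFAULT;

	stac();					/* allow user accesses  */
	*sum = csum_partial_copy_generic((__force const void *)src, dst,
					 len, *sum, err, NULL);
	clac();					/* forbid them again    */

	return *err ? -EFAULT : 0;
}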
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 906fea315791..c905e89e19fe 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(copy_in_user);
68 * Since protection fault in copy_from/to_user is not a normal situation, 68 * Since protection fault in copy_from/to_user is not a normal situation,
69 * it is not necessary to optimize tail handling. 69 * it is not necessary to optimize tail handling.
70 */ 70 */
71unsigned long 71__visible unsigned long
72copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) 72copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
73{ 73{
74 char c; 74 char c;
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5d7e51f3fd28..533a85e3a07e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,10 +1,8 @@
1# x86 Opcode Maps 1# x86 Opcode Maps
2# 2#
3# This is (mostly) based on following documentations. 3# This is (mostly) based on following documentations.
4# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 4# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C
5# (#325383-040US, October 2011) 5# (#326018-047US, June 2013)
6# - Intel(R) Advanced Vector Extensions Programming Reference
7# (#319433-011,JUNE 2011).
8# 6#
9#<Opcode maps> 7#<Opcode maps>
10# Table: table-name 8# Table: table-name
@@ -29,6 +27,7 @@
29# - (F3): the last prefix is 0xF3 27# - (F3): the last prefix is 0xF3
30# - (F2): the last prefix is 0xF2 28# - (F2): the last prefix is 0xF2
31# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) 29# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
30# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
32 31
33Table: one byte opcode 32Table: one byte opcode
34Referrer: 33Referrer:
@@ -246,8 +245,8 @@ c2: RETN Iw (f64)
246c3: RETN 245c3: RETN
247c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) 246c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
248c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) 247c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
249c6: Grp11 Eb,Ib (1A) 248c6: Grp11A Eb,Ib (1A)
250c7: Grp11 Ev,Iz (1A) 249c7: Grp11B Ev,Iz (1A)
251c8: ENTER Iw,Ib 250c8: ENTER Iw,Ib
252c9: LEAVE (d64) 251c9: LEAVE (d64)
253ca: RETF Iw 252ca: RETF Iw
@@ -293,8 +292,8 @@ ef: OUT DX,eAX
293# 0xf0 - 0xff 292# 0xf0 - 0xff
294f0: LOCK (Prefix) 293f0: LOCK (Prefix)
295f1: 294f1:
296f2: REPNE (Prefix) 295f2: REPNE (Prefix) | XACQUIRE (Prefix)
297f3: REP/REPE (Prefix) 296f3: REP/REPE (Prefix) | XRELEASE (Prefix)
298f4: HLT 297f4: HLT
299f5: CMC 298f5: CMC
300f6: Grp3_1 Eb (1A) 299f6: Grp3_1 Eb (1A)
@@ -326,7 +325,8 @@ AVXcode: 1
3260a: 3250a:
3270b: UD2 (1B) 3260b: UD2 (1B)
3280c: 3270c:
3290d: NOP Ev | GrpP 328# AMD's prefetch group. Intel supports prefetchw(/1) only.
3290d: GrpP
3300e: FEMMS 3300e: FEMMS
331# 3DNow! uses the last imm byte as opcode extension. 331# 3DNow! uses the last imm byte as opcode extension.
3320f: 3DNow! Pq,Qq,Ib 3320f: 3DNow! Pq,Qq,Ib
@@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
729dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) 729dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
730de: VAESDEC Vdq,Hdq,Wdq (66),(v1) 730de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
731df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) 731df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
732f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) 732f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
733f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) 733f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
734f2: ANDN Gy,By,Ey (v) 734f2: ANDN Gy,By,Ey (v)
735f3: Grp17 (1A) 735f3: Grp17 (1A)
736f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) 736f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
737f6: MULX By,Gy,rDX,Ey (F2),(v) 737f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
738f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) 738f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
739EndTable 739EndTable
740 740
@@ -861,8 +861,8 @@ EndTable
861 861
862GrpTable: Grp7 862GrpTable: Grp7
8630: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 8630: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8641: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) 8641: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B)
8652: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) 8652: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
8663: LIDT Ms 8663: LIDT Ms
8674: SMSW Mw/Rv 8674: SMSW Mw/Rv
8685: 8685:
@@ -880,15 +880,21 @@ EndTable
880GrpTable: Grp9 880GrpTable: Grp9
8811: CMPXCHG8B/16B Mq/Mdq 8811: CMPXCHG8B/16B Mq/Mdq
8826: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 8826: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
8837: VMPTRST Mq | VMPTRST Mq (F3) 8837: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
884EndTable 884EndTable
885 885
886GrpTable: Grp10 886GrpTable: Grp10
887EndTable 887EndTable
888 888
889GrpTable: Grp11 889# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
890# Note: the operands are given by group opcode 890GrpTable: Grp11A
8910: MOV 8910: MOV Eb,Ib
8927: XABORT Ib (000),(11B)
893EndTable
894
895GrpTable: Grp11B
8960: MOV Eb,Iz
8977: XBEGIN Jz (000),(11B)
892EndTable 898EndTable
893 899
894GrpTable: Grp12 900GrpTable: Grp12
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2ec29ac78ae6..04664cdb7fda 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num)
78 return __va(pfn << PAGE_SHIFT); 78 return __va(pfn << PAGE_SHIFT);
79} 79}
80 80
81/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ 81/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
82#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) 82#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
83RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); 83RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
84void __init early_alloc_pgt_buf(void) 84void __init early_alloc_pgt_buf(void)
85{ 85{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0215e2c563ef..799580cabc78 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
487 unsigned long offset; 487 unsigned long offset;
488 resource_size_t last_addr; 488 resource_size_t last_addr;
489 unsigned int nrpages; 489 unsigned int nrpages;
490 enum fixed_addresses idx0, idx; 490 enum fixed_addresses idx;
491 int i, slot; 491 int i, slot;
492 492
493 WARN_ON(system_state != SYSTEM_BOOTING); 493 WARN_ON(system_state != SYSTEM_BOOTING);
@@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
540 /* 540 /*
541 * Ok, go for it.. 541 * Ok, go for it..
542 */ 542 */
543 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; 543 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
544 idx = idx0;
545 while (nrpages > 0) { 544 while (nrpages > 0) {
546 early_set_fixmap(idx, phys_addr, prot); 545 early_set_fixmap(idx, phys_addr, prot);
547 phys_addr += PAGE_SIZE; 546 phys_addr += PAGE_SIZE;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 62c29a5bfe26..25e7e1372bb2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -112,11 +112,13 @@ static unsigned long mmap_legacy_base(void)
112 */ 112 */
113void arch_pick_mmap_layout(struct mm_struct *mm) 113void arch_pick_mmap_layout(struct mm_struct *mm)
114{ 114{
115 mm->mmap_legacy_base = mmap_legacy_base();
116 mm->mmap_base = mmap_base();
117
115 if (mmap_is_legacy()) { 118 if (mmap_is_legacy()) {
116 mm->mmap_base = mmap_legacy_base(); 119 mm->mmap_base = mm->mmap_legacy_base;
117 mm->get_unmapped_area = arch_get_unmapped_area; 120 mm->get_unmapped_area = arch_get_unmapped_area;
118 } else { 121 } else {
119 mm->mmap_base = mmap_base();
120 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 122 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
121 } 123 }
122} 124}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index cdd0da9dd530..266ca912f62e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
147{ 147{
148 u64 start, end; 148 u64 start, end;
149 u32 hotpluggable;
149 int node, pxm; 150 int node, pxm;
150 151
151 if (srat_disabled()) 152 if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
154 goto out_err_bad_srat; 155 goto out_err_bad_srat;
155 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) 156 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
156 goto out_err; 157 goto out_err;
157 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) 158 hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
159 if (hotpluggable && !save_add_info())
158 goto out_err; 160 goto out_err;
159 161
160 start = ma->base_address; 162 start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
174 176
175 node_set(node, numa_nodes_parsed); 177 node_set(node, numa_nodes_parsed);
176 178
177 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", 179 pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
178 node, pxm, 180 node, pxm,
179 (unsigned long long) start, (unsigned long long) end - 1); 181 (unsigned long long) start, (unsigned long long) end - 1,
182 hotpluggable ? " hotplug" : "");
180 183
181 return 0; 184 return 0;
182out_err_bad_srat: 185out_err_bad_srat:
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 48768df2471a..6890d8498e0b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy)
403 nmi_cpu_shutdown(dummy); 403 nmi_cpu_shutdown(dummy);
404} 404}
405 405
406static int nmi_create_files(struct super_block *sb, struct dentry *root) 406static int nmi_create_files(struct dentry *root)
407{ 407{
408 unsigned int i; 408 unsigned int i;
409 409
@@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
420 continue; 420 continue;
421 421
422 snprintf(buf, sizeof(buf), "%d", i); 422 snprintf(buf, sizeof(buf), "%d", i);
423 dir = oprofilefs_mkdir(sb, root, buf); 423 dir = oprofilefs_mkdir(root, buf);
424 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 424 oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
425 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 425 oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
426 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 426 oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
427 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 427 oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
428 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 428 oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
429 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 429 oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
430 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); 430 oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
431 } 431 }
432 432
433 return 0; 433 return 0;
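[Editor's sketch] The hunks above drop the struct super_block argument from the oprofilefs helpers, so callers hang everything off the root dentry alone. A hedged sketch of a driver's create_files callback after the change (the example_* names are illustrative, not from the patch):

#include <linux/oprofile.h>
#include <linux/dcache.h>

static unsigned long example_enabled;	/* illustrative state */

static int example_create_files(struct dentry *root)
{
	struct dentry *dir = oprofilefs_mkdir(root, "example");

	oprofilefs_create_ulong(dir, "enabled", &example_enabled);
	return 0;
}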
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b2b94438ff05..50d86c0e9ba4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -454,16 +454,16 @@ static void init_ibs(void)
454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); 454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
455} 455}
456 456
457static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 457static int (*create_arch_files)(struct dentry *root);
458 458
459static int setup_ibs_files(struct super_block *sb, struct dentry *root) 459static int setup_ibs_files(struct dentry *root)
460{ 460{
461 struct dentry *dir; 461 struct dentry *dir;
462 int ret = 0; 462 int ret = 0;
463 463
464 /* architecture specific files */ 464 /* architecture specific files */
465 if (create_arch_files) 465 if (create_arch_files)
466 ret = create_arch_files(sb, root); 466 ret = create_arch_files(root);
467 467
468 if (ret) 468 if (ret)
469 return ret; 469 return ret;
@@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
479 ibs_config.max_cnt_op = 250000; 479 ibs_config.max_cnt_op = 250000;
480 480
481 if (ibs_caps & IBS_CAPS_FETCHSAM) { 481 if (ibs_caps & IBS_CAPS_FETCHSAM) {
482 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 482 dir = oprofilefs_mkdir(root, "ibs_fetch");
483 oprofilefs_create_ulong(sb, dir, "enable", 483 oprofilefs_create_ulong(dir, "enable",
484 &ibs_config.fetch_enabled); 484 &ibs_config.fetch_enabled);
485 oprofilefs_create_ulong(sb, dir, "max_count", 485 oprofilefs_create_ulong(dir, "max_count",
486 &ibs_config.max_cnt_fetch); 486 &ibs_config.max_cnt_fetch);
487 oprofilefs_create_ulong(sb, dir, "rand_enable", 487 oprofilefs_create_ulong(dir, "rand_enable",
488 &ibs_config.rand_en); 488 &ibs_config.rand_en);
489 } 489 }
490 490
491 if (ibs_caps & IBS_CAPS_OPSAM) { 491 if (ibs_caps & IBS_CAPS_OPSAM) {
492 dir = oprofilefs_mkdir(sb, root, "ibs_op"); 492 dir = oprofilefs_mkdir(root, "ibs_op");
493 oprofilefs_create_ulong(sb, dir, "enable", 493 oprofilefs_create_ulong(dir, "enable",
494 &ibs_config.op_enabled); 494 &ibs_config.op_enabled);
495 oprofilefs_create_ulong(sb, dir, "max_count", 495 oprofilefs_create_ulong(dir, "max_count",
496 &ibs_config.max_cnt_op); 496 &ibs_config.max_cnt_op);
497 if (ibs_caps & IBS_CAPS_OPCNT) 497 if (ibs_caps & IBS_CAPS_OPCNT)
498 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 498 oprofilefs_create_ulong(dir, "dispatched_ops",
499 &ibs_config.dispatched_ops); 499 &ibs_config.dispatched_ops);
500 if (ibs_caps & IBS_CAPS_BRNTRGT) 500 if (ibs_caps & IBS_CAPS_BRNTRGT)
501 oprofilefs_create_ulong(sb, dir, "branch_target", 501 oprofilefs_create_ulong(dir, "branch_target",
502 &ibs_config.branch_target); 502 &ibs_config.branch_target);
503 } 503 }
504 504
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d641897a1f4e..b30e937689d6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -568,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
568 */ 568 */
569 if (bus) { 569 if (bus) {
570 struct pci_bus *child; 570 struct pci_bus *child;
571 list_for_each_entry(child, &bus->children, node) { 571 list_for_each_entry(child, &bus->children, node)
572 struct pci_dev *self = child->self; 572 pcie_bus_configure_settings(child);
573 if (!self)
574 continue;
575
576 pcie_bus_configure_settings(child, self->pcie_mpss);
577 }
578 } 573 }
579 574
580 if (bus && node != -1) { 575 if (bus && node != -1) {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 94919e307f8e..db6b1ab43255 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev)
210 r = &dev->resource[idx]; 210 r = &dev->resource[idx];
211 if (!r->flags) 211 if (!r->flags)
212 continue; 212 continue;
213 if (r->parent) /* Already allocated */
214 continue;
213 if (!r->start || pci_claim_resource(dev, idx) < 0) { 215 if (!r->start || pci_claim_resource(dev, idx) < 0) {
214 /* 216 /*
215 * Something is wrong with the region. 217 * Something is wrong with the region.
@@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev)
318 r = &dev->resource[PCI_ROM_RESOURCE]; 320 r = &dev->resource[PCI_ROM_RESOURCE];
319 if (!r->flags || !r->start) 321 if (!r->flags || !r->start)
320 return; 322 return;
323 if (r->parent) /* Already allocated */
324 return;
321 325
322 if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { 326 if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
323 r->end -= r->start; 327 r->end -= r->start;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 082e88129712..5596c7bdd327 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -700,7 +700,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
700 if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) 700 if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
701 return -ENODEV; 701 return -ENODEV;
702 702
703 if (start > end) 703 if (start > end || !addr)
704 return -EINVAL; 704 return -EINVAL;
705 705
706 mutex_lock(&pci_mmcfg_lock); 706 mutex_lock(&pci_mmcfg_lock);
@@ -716,11 +716,6 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
716 return -EEXIST; 716 return -EEXIST;
717 } 717 }
718 718
719 if (!addr) {
720 mutex_unlock(&pci_mmcfg_lock);
721 return -EINVAL;
722 }
723
724 rc = -EBUSY; 719 rc = -EBUSY;
725 cfg = pci_mmconfig_alloc(seg, start, end, addr); 720 cfg = pci_mmconfig_alloc(seg, start, end, addr);
726 if (cfg == NULL) { 721 if (cfg == NULL) {
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 6eb18c42a28a..903fded50786 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -23,11 +23,11 @@
23#include <linux/ioport.h> 23#include <linux/ioport.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26#include <linux/acpi.h>
27#include <linux/io.h>
28#include <linux/smp.h>
26 29
27#include <asm/acpi.h>
28#include <asm/segment.h> 30#include <asm/segment.h>
29#include <asm/io.h>
30#include <asm/smp.h>
31#include <asm/pci_x86.h> 31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h> 32#include <asm/hw_irq.h>
33#include <asm/io_apic.h> 33#include <asm/io_apic.h>
@@ -43,7 +43,7 @@
43#define PCI_FIXED_BAR_4_SIZE 0x14 43#define PCI_FIXED_BAR_4_SIZE 0x14
44#define PCI_FIXED_BAR_5_SIZE 0x1c 44#define PCI_FIXED_BAR_5_SIZE 0x1c
45 45
46static int pci_soc_mode = 0; 46static int pci_soc_mode;
47 47
48/** 48/**
49 * fixed_bar_cap - return the offset of the fixed BAR cap if found 49 * fixed_bar_cap - return the offset of the fixed BAR cap if found
@@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
141 */ 141 */
142static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) 142static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
143{ 143{
144 /* This is a workaround for A0 LNC bug where PCI status register does 144 /*
145 * This is a workaround for A0 LNC bug where PCI status register does
145 * not have new CAP bit set. can not be written by SW either. 146 * not have new CAP bit set. can not be written by SW either.
146 * 147 *
147 * PCI header type in real LNC indicates a single function device, this 148 * PCI header type in real LNC indicates a single function device, this
@@ -154,7 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
154 || devfn == PCI_DEVFN(0, 0) 155 || devfn == PCI_DEVFN(0, 0)
155 || devfn == PCI_DEVFN(3, 0))) 156 || devfn == PCI_DEVFN(3, 0)))
156 return 1; 157 return 1;
157 return 0; /* langwell on others */ 158 return 0; /* Langwell on others */
158} 159}
159 160
160static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, 161static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
@@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
172{ 173{
173 int offset; 174 int offset;
174 175
175 /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read 176 /*
177 * On MRST, there is no PCI ROM BAR, this will cause a subsequent read
176 * to ROM BAR return 0 then being ignored. 178 * to ROM BAR return 0 then being ignored.
177 */ 179 */
178 if (where == PCI_ROM_ADDRESS) 180 if (where == PCI_ROM_ADDRESS)
@@ -210,7 +212,8 @@ static int mrst_pci_irq_enable(struct pci_dev *dev)
210 212
211 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 213 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
212 214
213 /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to 215 /*
216 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
214 * IOAPIC RTE entries, so we just enable RTE for the device. 217 * IOAPIC RTE entries, so we just enable RTE for the device.
215 */ 218 */
216 irq_attr.ioapic = mp_find_ioapic(dev->irq); 219 irq_attr.ioapic = mp_find_ioapic(dev->irq);
@@ -235,7 +238,7 @@ struct pci_ops pci_mrst_ops = {
235 */ 238 */
236int __init pci_mrst_init(void) 239int __init pci_mrst_init(void)
237{ 240{
238 printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); 241 pr_info("Intel MID platform detected, using MID PCI ops\n");
239 pci_mmcfg_late_init(); 242 pci_mmcfg_late_init();
240 pcibios_enable_irq = mrst_pci_irq_enable; 243 pcibios_enable_irq = mrst_pci_irq_enable;
241 pci_root_ops = pci_mrst_ops; 244 pci_root_ops = pci_mrst_ops;
@@ -244,17 +247,21 @@ int __init pci_mrst_init(void)
244 return 1; 247 return 1;
245} 248}
246 249
247/* Langwell devices are not true pci devices, they are not subject to 10 ms 250/*
248 * d3 to d0 delay required by pci spec. 251 * Langwell devices are not true PCI devices; they are not subject to 10 ms
252 * d3 to d0 delay required by PCI spec.
249 */ 253 */
250static void pci_d3delay_fixup(struct pci_dev *dev) 254static void pci_d3delay_fixup(struct pci_dev *dev)
251{ 255{
252 /* PCI fixups are effectively decided compile time. If we have a dual 256 /*
253 SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ 257 * PCI fixups are effectively decided compile time. If we have a dual
254 if (!pci_soc_mode) 258 * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices.
255 return; 259 */
256 /* true pci devices in lincroft should allow type 1 access, the rest 260 if (!pci_soc_mode)
257 * are langwell fake pci devices. 261 return;
262 /*
263 * True PCI devices in Lincroft should allow type 1 access, the rest
264 * are Langwell fake PCI devices.
258 */ 265 */
259 if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) 266 if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
260 return; 267 return;
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index 643b8b5eee86..8244f5ec2f4c 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/reboot.h>
15#include <linux/serial_reg.h> 16#include <linux/serial_reg.h>
16#include <linux/serial_8250.h> 17#include <linux/serial_8250.h>
17#include <linux/reboot.h> 18#include <linux/reboot.h>
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1cf5b300305e..424f4c97a44d 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -25,10 +25,10 @@
25#include <asm/cpu.h> 25#include <asm/cpu.h>
26 26
27#ifdef CONFIG_X86_32 27#ifdef CONFIG_X86_32
28unsigned long saved_context_ebx; 28__visible unsigned long saved_context_ebx;
29unsigned long saved_context_esp, saved_context_ebp; 29__visible unsigned long saved_context_esp, saved_context_ebp;
30unsigned long saved_context_esi, saved_context_edi; 30__visible unsigned long saved_context_esi, saved_context_edi;
31unsigned long saved_context_eflags; 31__visible unsigned long saved_context_eflags;
32#endif 32#endif
33struct saved_context saved_context; 33struct saved_context saved_context;
34 34
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a0fde91c16cf..304fca20d96e 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -20,26 +20,26 @@
20#include <asm/suspend.h> 20#include <asm/suspend.h>
21 21
22/* References to section boundaries */ 22/* References to section boundaries */
23extern const void __nosave_begin, __nosave_end; 23extern __visible const void __nosave_begin, __nosave_end;
24 24
25/* Defined in hibernate_asm_64.S */ 25/* Defined in hibernate_asm_64.S */
26extern int restore_image(void); 26extern asmlinkage int restore_image(void);
27 27
28/* 28/*
29 * Address to jump to in the last phase of restore in order to get to the image 29 * Address to jump to in the last phase of restore in order to get to the image
30 * kernel's text (this value is passed in the image header). 30 * kernel's text (this value is passed in the image header).
31 */ 31 */
32unsigned long restore_jump_address; 32unsigned long restore_jump_address __visible;
33 33
34/* 34/*
35 * Value of the cr3 register from before the hibernation (this value is passed 35 * Value of the cr3 register from before the hibernation (this value is passed
36 * in the image header). 36 * in the image header).
37 */ 37 */
38unsigned long restore_cr3; 38unsigned long restore_cr3 __visible;
39 39
40pgd_t *temp_level4_pgt; 40pgd_t *temp_level4_pgt __visible;
41 41
42void *relocated_restore_code; 42void *relocated_restore_code __visible;
43 43
44static void *alloc_pgt_page(void *context) 44static void *alloc_pgt_page(void *context)
45{ 45{
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index e6773dc8ac41..093a892026f9 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -68,7 +68,7 @@ BEGIN {
68 68
69 lprefix1_expr = "\\((66|!F3)\\)" 69 lprefix1_expr = "\\((66|!F3)\\)"
70 lprefix2_expr = "\\(F3\\)" 70 lprefix2_expr = "\\(F3\\)"
71 lprefix3_expr = "\\((F2|!F3)\\)" 71 lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)"
72 lprefix_expr = "\\((66|F2|F3)\\)" 72 lprefix_expr = "\\((66|F2|F3)\\)"
73 max_lprefix = 4 73 max_lprefix = 4
74 74
@@ -83,6 +83,8 @@ BEGIN {
83 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" 83 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
84 prefix_num["REPNE"] = "INAT_PFX_REPNE" 84 prefix_num["REPNE"] = "INAT_PFX_REPNE"
85 prefix_num["REP/REPE"] = "INAT_PFX_REPE" 85 prefix_num["REP/REPE"] = "INAT_PFX_REPE"
86 prefix_num["XACQUIRE"] = "INAT_PFX_REPNE"
87 prefix_num["XRELEASE"] = "INAT_PFX_REPE"
86 prefix_num["LOCK"] = "INAT_PFX_LOCK" 88 prefix_num["LOCK"] = "INAT_PFX_LOCK"
87 prefix_num["SEG=CS"] = "INAT_PFX_CS" 89 prefix_num["SEG=CS"] = "INAT_PFX_CS"
88 prefix_num["SEG=DS"] = "INAT_PFX_DS" 90 prefix_num["SEG=DS"] = "INAT_PFX_DS"
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
85 cycle_t ret; 85 cycle_t ret;
86 u64 last; 86 u64 last;
87 u32 version; 87 u32 version;
88 u32 migrate_count;
89 u8 flags; 88 u8 flags;
90 unsigned cpu, cpu1; 89 unsigned cpu, cpu1;
91 90
92 91
93 /* 92 /*
94 * When looping to get a consistent (time-info, tsc) pair, we 93 * Note: hypervisor must guarantee that:
95 * also need to deal with the possibility we can switch vcpus, 94 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
96 * so make sure we always re-fetch time-info for the current vcpu. 95 * 2. that per-CPU pvclock time info is updated if the
96 * underlying CPU changes.
97 * 3. that version is increased whenever underlying CPU
98 * changes.
99 *
97 */ 100 */
98 do { 101 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK; 102 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
104 107
105 pvti = get_pvti(cpu); 108 pvti = get_pvti(cpu);
106 109
107 migrate_count = pvti->migrate_count;
108
109 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 110 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
110 111
111 /* 112 /*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
117 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 118 cpu1 = __getcpu() & VGETCPU_CPU_MASK;
118 } while (unlikely(cpu != cpu1 || 119 } while (unlikely(cpu != cpu1 ||
119 (pvti->pvti.version & 1) || 120 (pvti->pvti.version & 1) ||
120 pvti->pvti.version != version || 121 pvti->pvti.version != version));
121 pvti->migrate_count != migrate_count));
122 122
123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
124 *mode = VCLOCK_NONE; 124 *mode = VCLOCK_NONE;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 193097ef3d7d..2fc216dfbd9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void)
427 427
428 if (!xen_initial_domain()) 428 if (!xen_initial_domain())
429 cpuid_leaf1_edx_mask &= 429 cpuid_leaf1_edx_mask &=
430 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 430 ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
431 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
432 431
433 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); 432 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
434 433
@@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
735 addr = (unsigned long)xen_int3; 734 addr = (unsigned long)xen_int3;
736 else if (addr == (unsigned long)stack_segment) 735 else if (addr == (unsigned long)stack_segment)
737 addr = (unsigned long)xen_stack_segment; 736 addr = (unsigned long)xen_stack_segment;
738 else if (addr == (unsigned long)double_fault || 737 else if (addr == (unsigned long)double_fault) {
739 addr == (unsigned long)nmi) {
740 /* Don't need to handle these */ 738 /* Don't need to handle these */
741 return 0; 739 return 0;
742#ifdef CONFIG_X86_MCE 740#ifdef CONFIG_X86_MCE
@@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
747 */ 745 */
748 ; 746 ;
749#endif 747#endif
750 } else { 748 } else if (addr == (unsigned long)nmi)
749 /*
750 * Use the native version as well.
751 */
752 ;
753 else {
751 /* Some other trap using IST? */ 754 /* Some other trap using IST? */
752 if (WARN_ON(val->ist != 0)) 755 if (WARN_ON(val->ist != 0))
753 return 0; 756 return 0;
@@ -1710,6 +1713,8 @@ static void __init xen_hvm_guest_init(void)
1710 1713
1711 xen_hvm_init_shared_info(); 1714 xen_hvm_init_shared_info();
1712 1715
1716 xen_panic_handler_init();
1717
1713 if (xen_feature(XENFEAT_hvm_callback_vector)) 1718 if (xen_feature(XENFEAT_hvm_callback_vector))
1714 xen_have_vector_callback = 1; 1719 xen_have_vector_callback = 1;
1715 xen_hvm_smp_init(); 1720 xen_hvm_smp_init();
@@ -1720,15 +1725,12 @@ static void __init xen_hvm_guest_init(void)
1720 xen_hvm_init_mmu_ops(); 1725 xen_hvm_init_mmu_ops();
1721} 1726}
1722 1727
1723static bool __init xen_hvm_platform(void) 1728static uint32_t __init xen_hvm_platform(void)
1724{ 1729{
1725 if (xen_pv_domain()) 1730 if (xen_pv_domain())
1726 return false; 1731 return 0;
1727
1728 if (!xen_cpuid_base())
1729 return false;
1730 1732
1731 return true; 1733 return xen_cpuid_base();
1732} 1734}
1733 1735
1734bool xen_hvm_need_lapic(void) 1736bool xen_hvm_need_lapic(void)
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae1..0da7f863056f 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags)
47 /* convert from IF type flag */ 47 /* convert from IF type flag */
48 flags = !(flags & X86_EFLAGS_IF); 48 flags = !(flags & X86_EFLAGS_IF);
49 49
50 /* There's a one instruction preempt window here. We need to 50 /* See xen_irq_enable() for why preemption must be disabled. */
51 make sure we're don't switch CPUs between getting the vcpu
52 pointer and updating the mask. */
53 preempt_disable(); 51 preempt_disable();
54 vcpu = this_cpu_read(xen_vcpu); 52 vcpu = this_cpu_read(xen_vcpu);
55 vcpu->evtchn_upcall_mask = flags; 53 vcpu->evtchn_upcall_mask = flags;
56 preempt_enable_no_resched();
57
58 /* Doesn't matter if we get preempted here, because any
59 pending event will get dealt with anyway. */
60 54
61 if (flags == 0) { 55 if (flags == 0) {
62 preempt_check_resched();
63 barrier(); /* unmask then check (avoid races) */ 56 barrier(); /* unmask then check (avoid races) */
64 if (unlikely(vcpu->evtchn_upcall_pending)) 57 if (unlikely(vcpu->evtchn_upcall_pending))
65 xen_force_evtchn_callback(); 58 xen_force_evtchn_callback();
66 } 59 preempt_enable();
60 } else
61 preempt_enable_no_resched();
67} 62}
68PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); 63PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
69 64
@@ -82,10 +77,12 @@ static void xen_irq_enable(void)
82{ 77{
83 struct vcpu_info *vcpu; 78 struct vcpu_info *vcpu;
84 79
85 /* We don't need to worry about being preempted here, since 80 /*
86 either a) interrupts are disabled, so no preemption, or b) 81 * We may be preempted as soon as vcpu->evtchn_upcall_mask is
87 the caller is confused and is trying to re-enable interrupts 82 * cleared, so disable preemption to ensure we check for
88 on an indeterminate processor. */ 83 * events on the VCPU we are still running on.
84 */
85 preempt_disable();
89 86
90 vcpu = this_cpu_read(xen_vcpu); 87 vcpu = this_cpu_read(xen_vcpu);
91 vcpu->evtchn_upcall_mask = 0; 88 vcpu->evtchn_upcall_mask = 0;
@@ -96,6 +93,8 @@ static void xen_irq_enable(void)
96 barrier(); /* unmask then check (avoid races) */ 93 barrier(); /* unmask then check (avoid races) */
97 if (unlikely(vcpu->evtchn_upcall_pending)) 94 if (unlikely(vcpu->evtchn_upcall_pending))
98 xen_force_evtchn_callback(); 95 xen_force_evtchn_callback();
96
97 preempt_enable();
99} 98}
100PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); 99PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
101 100
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927e..0d4ec35895d4 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -161,6 +161,7 @@
161#include <asm/xen/page.h> 161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h> 162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h> 163#include <asm/xen/hypervisor.h>
164#include <xen/balloon.h>
164#include <xen/grant_table.h> 165#include <xen/grant_table.h>
165 166
166#include "multicalls.h" 167#include "multicalls.h"
@@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page,
967 if (kmap_op != NULL) { 968 if (kmap_op != NULL) {
968 if (!PageHighMem(page)) { 969 if (!PageHighMem(page)) {
969 struct multicall_space mcs; 970 struct multicall_space mcs;
970 struct gnttab_unmap_grant_ref *unmap_op; 971 struct gnttab_unmap_and_replace *unmap_op;
972 struct page *scratch_page = get_balloon_scratch_page();
973 unsigned long scratch_page_address = (unsigned long)
974 __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
971 975
972 /* 976 /*
973 * It might be that we queued all the m2p grant table 977 * It might be that we queued all the m2p grant table
@@ -990,21 +994,25 @@ int m2p_remove_override(struct page *page,
990 } 994 }
991 995
992 mcs = xen_mc_entry( 996 mcs = xen_mc_entry(
993 sizeof(struct gnttab_unmap_grant_ref)); 997 sizeof(struct gnttab_unmap_and_replace));
994 unmap_op = mcs.args; 998 unmap_op = mcs.args;
995 unmap_op->host_addr = kmap_op->host_addr; 999 unmap_op->host_addr = kmap_op->host_addr;
1000 unmap_op->new_addr = scratch_page_address;
996 unmap_op->handle = kmap_op->handle; 1001 unmap_op->handle = kmap_op->handle;
997 unmap_op->dev_bus_addr = 0;
998 1002
999 MULTI_grant_table_op(mcs.mc, 1003 MULTI_grant_table_op(mcs.mc,
1000 GNTTABOP_unmap_grant_ref, unmap_op, 1); 1004 GNTTABOP_unmap_and_replace, unmap_op, 1);
1001 1005
1002 xen_mc_issue(PARAVIRT_LAZY_MMU); 1006 xen_mc_issue(PARAVIRT_LAZY_MMU);
1003 1007
1004 set_pte_at(&init_mm, address, ptep, 1008 mcs = __xen_mc_entry(0);
1005 pfn_pte(pfn, PAGE_KERNEL)); 1009 MULTI_update_va_mapping(mcs.mc, scratch_page_address,
1006 __flush_tlb_single(address); 1010 pfn_pte(page_to_pfn(get_balloon_scratch_page()),
1011 PAGE_KERNEL_RO), 0);
1012 xen_mc_issue(PARAVIRT_LAZY_MMU);
1013
1007 kmap_op->host_addr = 0; 1014 kmap_op->host_addr = 0;
1015 put_balloon_scratch_page();
1008 } 1016 }
1009 } 1017 }
1010 1018
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 056d11faef21..09f3059cb00b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -33,6 +33,9 @@
33/* These are code, but not functions. Defined in entry.S */ 33/* These are code, but not functions. Defined in entry.S */
34extern const char xen_hypervisor_callback[]; 34extern const char xen_hypervisor_callback[];
35extern const char xen_failsafe_callback[]; 35extern const char xen_failsafe_callback[];
36#ifdef CONFIG_X86_64
37extern const char nmi[];
38#endif
36extern void xen_sysenter_target(void); 39extern void xen_sysenter_target(void);
37extern void xen_syscall_target(void); 40extern void xen_syscall_target(void);
38extern void xen_syscall32_target(void); 41extern void xen_syscall32_target(void);
@@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk(
215 unsigned long pfn; 218 unsigned long pfn;
216 219
217 /* 220 /*
218 * If the PFNs are currently mapped, the VA mapping also needs 221 * If the PFNs are currently mapped, clear the mappings
219 * to be updated to be 1:1. 222 * (except for the ISA region which must be 1:1 mapped) to
223 * release the refcounts (in Xen) on the original frames.
220 */ 224 */
221 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) 225 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
226 pte_t pte = __pte_ma(0);
227
228 if (pfn < PFN_UP(ISA_END_ADDRESS))
229 pte = mfn_pte(pfn, PAGE_KERNEL_IO);
230
222 (void)HYPERVISOR_update_va_mapping( 231 (void)HYPERVISOR_update_va_mapping(
223 (unsigned long)__va(pfn << PAGE_SHIFT), 232 (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
224 mfn_pte(pfn, PAGE_KERNEL_IO), 0); 233 }
225 234
226 if (start_pfn < nr_pages) 235 if (start_pfn < nr_pages)
227 *released += xen_release_chunk( 236 *released += xen_release_chunk(
@@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
313 e820_add_region(start, end - start, type); 322 e820_add_region(start, end - start, type);
314} 323}
315 324
325void xen_ignore_unusable(struct e820entry *list, size_t map_size)
326{
327 struct e820entry *entry;
328 unsigned int i;
329
330 for (i = 0, entry = list; i < map_size; i++, entry++) {
331 if (entry->type == E820_UNUSABLE)
332 entry->type = E820_RAM;
333 }
334}
335
316/** 336/**
317 * machine_specific_memory_setup - Hook for machine specific memory setup. 337 * machine_specific_memory_setup - Hook for machine specific memory setup.
318 **/ 338 **/
@@ -353,6 +373,17 @@ char * __init xen_memory_setup(void)
353 } 373 }
354 BUG_ON(rc); 374 BUG_ON(rc);
355 375
376 /*
377 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
378 * regions, so if we're using the machine memory map leave the
379 * region as RAM as it is in the pseudo-physical map.
380 *
381 * UNUSABLE regions in domUs are not handled and will need
382 * a patch in the future.
383 */
384 if (xen_initial_domain())
385 xen_ignore_unusable(map, memmap.nr_entries);
386
356 /* Make sure the Xen-supplied memory map is well-ordered. */ 387 /* Make sure the Xen-supplied memory map is well-ordered. */
357 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); 388 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
358 389
@@ -525,7 +556,13 @@ void xen_enable_syscall(void)
525 } 556 }
526#endif /* CONFIG_X86_64 */ 557#endif /* CONFIG_X86_64 */
527} 558}
528 559void __cpuinit xen_enable_nmi(void)
560{
561#ifdef CONFIG_X86_64
562 if (register_callback(CALLBACKTYPE_nmi, nmi))
563 BUG();
564#endif
565}
529void __init xen_arch_setup(void) 566void __init xen_arch_setup(void)
530{ 567{
531 xen_panic_handler_init(); 568 xen_panic_handler_init();
@@ -543,7 +580,7 @@ void __init xen_arch_setup(void)
543 580
544 xen_enable_sysenter(); 581 xen_enable_sysenter();
545 xen_enable_syscall(); 582 xen_enable_syscall();
546 583 xen_enable_nmi();
547#ifdef CONFIG_ACPI 584#ifdef CONFIG_ACPI
548 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 585 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
549 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 586 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index ca92754eb846..9235842cd76a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -279,6 +279,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
279 279
280 xen_filter_cpu_maps(); 280 xen_filter_cpu_maps();
281 xen_setup_vcpu_info_placement(); 281 xen_setup_vcpu_info_placement();
282 xen_init_spinlocks();
282} 283}
283 284
284static void __init xen_smp_prepare_cpus(unsigned int max_cpus) 285static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
@@ -572,6 +573,12 @@ static inline int xen_map_vector(int vector)
572 case IRQ_WORK_VECTOR: 573 case IRQ_WORK_VECTOR:
573 xen_vector = XEN_IRQ_WORK_VECTOR; 574 xen_vector = XEN_IRQ_WORK_VECTOR;
574 break; 575 break;
576#ifdef CONFIG_X86_64
577 case NMI_VECTOR:
578 case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
579 xen_vector = XEN_NMI_VECTOR;
580 break;
581#endif
575 default: 582 default:
576 xen_vector = -1; 583 xen_vector = -1;
577 printk(KERN_ERR "xen: vector 0x%x is not implemented\n", 584 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -680,7 +687,6 @@ void __init xen_smp_init(void)
680{ 687{
681 smp_ops = xen_smp_ops; 688 smp_ops = xen_smp_ops;
682 xen_fill_possible_map(); 689 xen_fill_possible_map();
683 xen_init_spinlocks();
684} 690}
685 691
686static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) 692static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
@@ -694,8 +700,15 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
694static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 700static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
695{ 701{
696 int rc; 702 int rc;
697 rc = native_cpu_up(cpu, tidle); 703 /*
698 WARN_ON (xen_smp_intr_init(cpu)); 704 * xen_smp_intr_init() needs to run before native_cpu_up()
705 * so that IPI vectors are set up on the booting CPU before
706 * it is marked online in native_cpu_up().
707 */
708 rc = xen_smp_intr_init(cpu);
709 WARN_ON(rc);
710 if (!rc)
711 rc = native_cpu_up(cpu, tidle);
699 return rc; 712 return rc;
700} 713}
701 714
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index cf3caee356b3..0438b9324a72 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,45 +17,44 @@
17#include "xen-ops.h" 17#include "xen-ops.h"
18#include "debugfs.h" 18#include "debugfs.h"
19 19
20#ifdef CONFIG_XEN_DEBUG_FS 20enum xen_contention_stat {
21static struct xen_spinlock_stats 21 TAKEN_SLOW,
22{ 22 TAKEN_SLOW_PICKUP,
23 u64 taken; 23 TAKEN_SLOW_SPURIOUS,
24 u32 taken_slow; 24 RELEASED_SLOW,
25 u32 taken_slow_nested; 25 RELEASED_SLOW_KICKED,
26 u32 taken_slow_pickup; 26 NR_CONTENTION_STATS
27 u32 taken_slow_spurious; 27};
28 u32 taken_slow_irqenable;
29 28
30 u64 released;
31 u32 released_slow;
32 u32 released_slow_kicked;
33 29
30#ifdef CONFIG_XEN_DEBUG_FS
34#define HISTO_BUCKETS 30 31#define HISTO_BUCKETS 30
35 u32 histo_spin_total[HISTO_BUCKETS+1]; 32static struct xen_spinlock_stats
36 u32 histo_spin_spinning[HISTO_BUCKETS+1]; 33{
34 u32 contention_stats[NR_CONTENTION_STATS];
37 u32 histo_spin_blocked[HISTO_BUCKETS+1]; 35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
38
39 u64 time_total;
40 u64 time_spinning;
41 u64 time_blocked; 36 u64 time_blocked;
42} spinlock_stats; 37} spinlock_stats;
43 38
44static u8 zero_stats; 39static u8 zero_stats;
45 40
46static unsigned lock_timeout = 1 << 10;
47#define TIMEOUT lock_timeout
48
49static inline void check_zero(void) 41static inline void check_zero(void)
50{ 42{
51 if (unlikely(zero_stats)) { 43 u8 ret;
52 memset(&spinlock_stats, 0, sizeof(spinlock_stats)); 44 u8 old = ACCESS_ONCE(zero_stats);
53 zero_stats = 0; 45 if (unlikely(old)) {
46 ret = cmpxchg(&zero_stats, old, 0);
47 /* This ensures only one fellow resets the stat */
48 if (ret == old)
49 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
54 } 50 }
55} 51}
56 52
57#define ADD_STATS(elem, val) \ 53static inline void add_stats(enum xen_contention_stat var, u32 val)
58 do { check_zero(); spinlock_stats.elem += (val); } while(0) 54{
55 check_zero();
56 spinlock_stats.contention_stats[var] += val;
57}
59 58
60static inline u64 spin_time_start(void) 59static inline u64 spin_time_start(void)
61{ 60{
@@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
 	array[HISTO_BUCKETS]++;
 }
 
-static inline void spin_time_accum_spinning(u64 start)
-{
-	u32 delta = xen_clocksource_read() - start;
-
-	__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
-	spinlock_stats.time_spinning += delta;
-}
-
-static inline void spin_time_accum_total(u64 start)
-{
-	u32 delta = xen_clocksource_read() - start;
-
-	__spin_time_accum(delta, spinlock_stats.histo_spin_total);
-	spinlock_stats.time_total += delta;
-}
-
 static inline void spin_time_accum_blocked(u64 start)
 {
 	u32 delta = xen_clocksource_read() - start;
@@ -99,19 +82,15 @@ static inline void spin_time_accum_blocked(u64 start)
 }
 #else  /* !CONFIG_XEN_DEBUG_FS */
 #define TIMEOUT			(1 << 10)
-#define ADD_STATS(elem, val)	do { (void)(val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
 
 static inline u64 spin_time_start(void)
 {
 	return 0;
 }
 
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
 static inline void spin_time_accum_blocked(u64 start)
 {
 }
@@ -134,227 +113,123 @@ typedef u16 xen_spinners_t;
 	asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
 #endif
 
-struct xen_spinlock {
-	unsigned char lock;		/* 0 -> free; 1 -> locked */
-	xen_spinners_t spinners;	/* count of waiting cpus */
+struct xen_lock_waiting {
+	struct arch_spinlock *lock;
+	__ticket_t want;
 };
 
-static int xen_spin_is_locked(struct arch_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct arch_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	/* Not strictly true; this is only the count of contended
-	   lock-takers entering the slow path. */
-	return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct arch_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	u8 old = 1;
-
-	asm("xchgb %b0,%1"
-	    : "+q" (old), "+m" (xl->lock) : : "memory");
-
-	return old == 0;
-}
-
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-/*
- * Mark a cpu as interested in a lock.  Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
-{
-	struct xen_spinlock *prev;
-
-	prev = __this_cpu_read(lock_spinners);
-	__this_cpu_write(lock_spinners, xl);
-
-	wmb();			/* set lock of interest before count */
-
-	inc_spinners(xl);
-
-	return prev;
-}
-
-/*
- * Mark a cpu as no longer interested in a lock.  Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
-{
-	dec_spinners(xl);
-	wmb();			/* decrement count before restoring lock */
-	__this_cpu_write(lock_spinners, prev);
-}
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
 
-static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
+static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	struct xen_spinlock *prev;
 	int irq = __this_cpu_read(lock_kicker_irq);
-	int ret;
+	struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+	int cpu = smp_processor_id();
 	u64 start;
+	unsigned long flags;
 
 	/* If kicker interrupts not initialized yet, just spin */
 	if (irq == -1)
-		return 0;
+		return;
 
 	start = spin_time_start();
 
-	/* announce we're spinning */
-	prev = spinning_lock(xl);
+	/*
+	 * Make sure an interrupt handler can't upset things in a
+	 * partially setup state.
+	 */
+	local_irq_save(flags);
+	/*
+	 * We don't really care if we're overwriting some other
+	 * (lock,want) pair, as that would mean that we're currently
+	 * in an interrupt context, and the outer context had
+	 * interrupts enabled.  That has already kicked the VCPU out
+	 * of xen_poll_irq(), so it will just return spuriously and
+	 * retry with newly setup (lock,want).
+	 *
+	 * The ordering protocol on this is that the "lock" pointer
+	 * may only be set non-NULL if the "want" ticket is correct.
+	 * If we're updating "want", we must first clear "lock".
+	 */
+	w->lock = NULL;
+	smp_wmb();
+	w->want = want;
+	smp_wmb();
+	w->lock = lock;
 
-	ADD_STATS(taken_slow, 1);
-	ADD_STATS(taken_slow_nested, prev != NULL);
+	/* This uses set_bit, which atomic and therefore a barrier */
+	cpumask_set_cpu(cpu, &waiting_cpus);
+	add_stats(TAKEN_SLOW, 1);
 
-	do {
-		unsigned long flags;
+	/* clear pending */
+	xen_clear_irq_pending(irq);
 
-		/* clear pending */
-		xen_clear_irq_pending(irq);
+	/* Only check lock once pending cleared */
+	barrier();
 
-		/* check again make sure it didn't become free while
-		   we weren't looking  */
-		ret = xen_spin_trylock(lock);
-		if (ret) {
-			ADD_STATS(taken_slow_pickup, 1);
+	/*
+	 * Mark entry to slowpath before doing the pickup test to make
+	 * sure we don't deadlock with an unlocker.
+	 */
+	__ticket_enter_slowpath(lock);
 
-			/*
-			 * If we interrupted another spinlock while it
-			 * was blocking, make sure it doesn't block
-			 * without rechecking the lock.
-			 */
-			if (prev != NULL)
-				xen_set_irq_pending(irq);
-			goto out;
-		}
+	/*
+	 * check again make sure it didn't become free while
+	 * we weren't looking
+	 */
+	if (ACCESS_ONCE(lock->tickets.head) == want) {
+		add_stats(TAKEN_SLOW_PICKUP, 1);
+		goto out;
+	}
 
-		flags = arch_local_save_flags();
-		if (irq_enable) {
-			ADD_STATS(taken_slow_irqenable, 1);
-			raw_local_irq_enable();
-		}
+	/* Allow interrupts while blocked */
+	local_irq_restore(flags);
 
-		/*
-		 * Block until irq becomes pending.  If we're
-		 * interrupted at this point (after the trylock but
-		 * before entering the block), then the nested lock
-		 * handler guarantees that the irq will be left
-		 * pending if there's any chance the lock became free;
-		 * xen_poll_irq() returns immediately if the irq is
-		 * pending.
-		 */
-		xen_poll_irq(irq);
+	/*
+	 * If an interrupt happens here, it will leave the wakeup irq
+	 * pending, which will cause xen_poll_irq() to return
+	 * immediately.
+	 */
 
-		raw_local_irq_restore(flags);
+	/* Block until irq becomes pending (or perhaps a spurious wakeup) */
+	xen_poll_irq(irq);
+	add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
 
-		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
-	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
+	local_irq_save(flags);
 
 	kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
-
 out:
-	unspinning_lock(xl, prev);
-	spin_time_accum_blocked(start);
-
-	return ret;
-}
-
-static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-	unsigned timeout;
-	u8 oldval;
-	u64 start_spin;
-
-	ADD_STATS(taken, 1);
-
-	start_spin = spin_time_start();
-
-	do {
-		u64 start_spin_fast = spin_time_start();
-
-		timeout = TIMEOUT;
-
-		asm("1: xchgb %1,%0\n"
-		    "   testb %1,%1\n"
-		    "   jz 3f\n"
-		    "2: rep;nop\n"
-		    "   cmpb $0,%0\n"
-		    "   je 1b\n"
-		    "   dec %2\n"
-		    "   jnz 2b\n"
-		    "3:\n"
-		    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
-		    : "1" (1)
-		    : "memory");
+	cpumask_clear_cpu(cpu, &waiting_cpus);
+	w->lock = NULL;
 
-		spin_time_accum_spinning(start_spin_fast);
+	local_irq_restore(flags);
 
-	} while (unlikely(oldval != 0 &&
-			  (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
-
-	spin_time_accum_total(start_spin);
-}
-
-static void xen_spin_lock(struct arch_spinlock *lock)
-{
-	__xen_spin_lock(lock, false);
-}
-
-static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
-{
-	__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+	spin_time_accum_blocked(start);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
 
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 {
 	int cpu;
 
-	ADD_STATS(released_slow, 1);
+	add_stats(RELEASED_SLOW, 1);
+
+	for_each_cpu(cpu, &waiting_cpus) {
+		const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
-	for_each_online_cpu(cpu) {
-		/* XXX should mix up next cpu selection */
-		if (per_cpu(lock_spinners, cpu) == xl) {
-			ADD_STATS(released_slow_kicked, 1);
+		/* Make sure we read lock before want */
+		if (ACCESS_ONCE(w->lock) == lock &&
+		    ACCESS_ONCE(w->want) == next) {
+			add_stats(RELEASED_SLOW_KICKED, 1);
 			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+			break;
 		}
 	}
 }
 
-static void xen_spin_unlock(struct arch_spinlock *lock)
-{
-	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-	ADD_STATS(released, 1);
-
-	smp_wmb();		/* make sure no writes get moved after unlock */
-	xl->lock = 0;		/* release lock */
-
-	/*
-	 * Make sure unlock happens before checking for waiting
-	 * spinners.  We need a strong barrier to enforce the
-	 * write-read ordering to different memory locations, as the
-	 * CPU makes no implied guarantees about their ordering.
-	 */
-	mb();
-
-	if (unlikely(xl->spinners))
-		xen_spin_unlock_slow(xl);
-}
-
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
 	BUG();
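The heart of the new slow path is the ordering protocol spelled out in the comment above: a waiter clears w->lock, then writes w->want, then sets w->lock, with write barriers in between, so the unlocker can trust any (lock, want) pair it observes by reading lock before want. A condensed userspace model of that publish/scan handshake, with C11 atomics standing in for smp_wmb()/ACCESS_ONCE() and a hypothetical wake_cpu() standing in for xen_send_IPI_one():

#include <stdatomic.h>
#include <stdio.h>

typedef unsigned short ticket_t;

struct lock_waiting {
	_Atomic(void *) lock;		/* non-NULL only while want is valid */
	_Atomic ticket_t want;
};

#define NR_CPUS 4
static struct lock_waiting waiting[NR_CPUS];

static void wake_cpu(int cpu)		/* stand-in for xen_send_IPI_one() */
{
	printf("kick cpu%d\n", cpu);
}

/* Waiter side: publish (lock, want); clear lock first, set it last. */
static void publish(int cpu, void *lock, ticket_t want)
{
	struct lock_waiting *w = &waiting[cpu];

	atomic_store_explicit(&w->lock, NULL, memory_order_relaxed);
	atomic_store_explicit(&w->want, want, memory_order_release);
	atomic_store_explicit(&w->lock, lock, memory_order_release);
}

/* Unlocker side: read lock before want; kick the first matching waiter. */
static void kick(void *lock, ticket_t next)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct lock_waiting *w = &waiting[cpu];

		if (atomic_load_explicit(&w->lock, memory_order_acquire) == lock &&
		    atomic_load_explicit(&w->want, memory_order_relaxed) == next) {
			wake_cpu(cpu);
			break;
		}
	}
}

int main(void)
{
	static int some_lock;

	publish(2, &some_lock, 7);
	kick(&some_lock, 7);		/* prints "kick cpu2" */
	return 0;
}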
@@ -408,6 +283,8 @@ void xen_uninit_lock_cpu(int cpu)
 	per_cpu(irq_name, cpu) = NULL;
 }
 
+static bool xen_pvspin __initdata = true;
+
 void __init xen_init_spinlocks(void)
 {
 	/*
@@ -417,15 +294,23 @@ void __init xen_init_spinlocks(void)
 	if (xen_hvm_domain())
 		return;
 
-	BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
+	if (!xen_pvspin) {
+		printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+		return;
+	}
 
-	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
-	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
-	pv_lock_ops.spin_lock = xen_spin_lock;
-	pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
-	pv_lock_ops.spin_trylock = xen_spin_trylock;
-	pv_lock_ops.spin_unlock = xen_spin_unlock;
+	static_key_slow_inc(&paravirt_ticketlocks_enabled);
+
+	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
+	pv_lock_ops.unlock_kick = xen_unlock_kick;
+}
+
+static __init int xen_parse_nopvspin(char *arg)
+{
+	xen_pvspin = false;
+	return 0;
 }
+early_param("xen_nopvspin", xen_parse_nopvspin);
 
 #ifdef CONFIG_XEN_DEBUG_FS
 
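xen_init_spinlocks() now has a boot-time chicken bit: passing xen_nopvspin on the command line skips the pv_lock_ops patching and leaves the native ticketlock paths alone. For comparison, a sketch of how the same early_param() idiom could gate a hypothetical feature (all the "myfeature" names are made up, not part of this patch):

/* Sketch only -- a hypothetical user of the early_param() pattern. */
#include <linux/init.h>
#include <linux/kernel.h>

static bool myfeature_enabled __initdata = true;

static __init int parse_nomyfeature(char *arg)
{
	myfeature_enabled = false;
	return 0;
}
early_param("nomyfeature", parse_nomyfeature);

static int __init myfeature_init(void)
{
	if (!myfeature_enabled) {
		pr_debug("myfeature: disabled on the command line\n");
		return 0;
	}
	/* ...hook up the feature here... */
	return 0;
}
early_initcall(myfeature_init);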
@@ -442,37 +327,21 @@ static int __init xen_spinlock_debugfs(void)
 
 	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
 
-	debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
-
-	debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
 	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
-			   &spinlock_stats.taken_slow);
-	debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
-			   &spinlock_stats.taken_slow_nested);
+			   &spinlock_stats.contention_stats[TAKEN_SLOW]);
 	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
-			   &spinlock_stats.taken_slow_pickup);
+			   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
 	debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
-			   &spinlock_stats.taken_slow_spurious);
-	debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
-			   &spinlock_stats.taken_slow_irqenable);
+			   &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
 
-	debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
 	debugfs_create_u32("released_slow", 0444, d_spin_debug,
-			   &spinlock_stats.released_slow);
+			   &spinlock_stats.contention_stats[RELEASED_SLOW]);
 	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
-			   &spinlock_stats.released_slow_kicked);
+			   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
 
-	debugfs_create_u64("time_spinning", 0444, d_spin_debug,
-			   &spinlock_stats.time_spinning);
 	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
 			   &spinlock_stats.time_blocked);
-	debugfs_create_u64("time_total", 0444, d_spin_debug,
-			   &spinlock_stats.time_total);
 
-	debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
-				 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
-	debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
-				 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
 	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
 				 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
 
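Because the counters now live in one contention_stats[] array indexed by the enum, the per-file debugfs_create_u32() calls above could equally be table-driven. A sketch of such a fragment for the same file (the contention_names table and the xen_spinlock_stats_debugfs helper are hypothetical, not part of the patch; it assumes the file's existing debugfs includes):

/* Sketch: table-driven version of the debugfs hookup above. */
static const char *const contention_names[NR_CONTENTION_STATS] = {
	[TAKEN_SLOW]		= "taken_slow",
	[TAKEN_SLOW_PICKUP]	= "taken_slow_pickup",
	[TAKEN_SLOW_SPURIOUS]	= "taken_slow_spurious",
	[RELEASED_SLOW]		= "released_slow",
	[RELEASED_SLOW_KICKED]	= "released_slow_kicked",
};

static void __init xen_spinlock_stats_debugfs(struct dentry *d_spin_debug)
{
	int i;

	for (i = 0; i < NR_CONTENTION_STATS; i++)
		debugfs_create_u32(contention_names[i], 0444, d_spin_debug,
				   &spinlock_stats.contention_stats[i]);
}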
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 86782c5d7e2a..95f8c6142328 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -105,9 +105,9 @@ static inline void __init xen_init_apic(void)
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
 #define DECL_ASM(ret, name, ...)		\
-	ret name(__VA_ARGS__);			\
-	extern char name##_end[];		\
-	extern char name##_reloc[]		\
+	__visible ret name(__VA_ARGS__);	\
+	extern char name##_end[] __visible;	\
+	extern char name##_reloc[] __visible
 
 DECL_ASM(void, xen_irq_enable_direct, void);
 DECL_ASM(void, xen_irq_disable_direct, void);
@@ -115,11 +115,11 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 /* These are not functions, and cannot be called normally */
-void xen_iret(void);
-void xen_sysexit(void);
-void xen_sysret32(void);
-void xen_sysret64(void);
-void xen_adjust_exception_frame(void);
+__visible void xen_iret(void);
+__visible void xen_sysexit(void);
+__visible void xen_sysret32(void);
+__visible void xen_sysret64(void);
+__visible void xen_adjust_exception_frame(void);
 
 extern int xen_panic_handler_init(void);
 
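These entry points are referenced only from assembly, so without an annotation a whole-program or LTO build is free to treat them as unused. __visible is, roughly, the kernel's wrapper around GCC's externally_visible attribute; a minimal standalone illustration of the idea (asm_only_entry and the local __visible define are illustrative assumptions, not the kernel's definitions):

/* Sketch: why asm-referenced symbols want externally_visible. */
#define __visible __attribute__((externally_visible))

/* Referenced only through the asm below, never called from C, so an
 * aggressive whole-program optimizer could otherwise drop or localize it. */
__visible void asm_only_entry(void)
{
}

int main(void)
{
	/* Hypothetical asm reference; keeps the example self-contained. */
	asm volatile("" : : "r" (asm_only_entry));
	return 0;
}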