author    H. Peter Anvin <hpa@linux.intel.com>  2012-01-19 15:56:50 -0500
committer H. Peter Anvin <hpa@linux.intel.com>  2012-01-19 15:56:50 -0500
commit    282f445a779ed76fca9884fe377bf56a3088b208
tree      d9abcf526baee0100672851e0a8894c19e762a39
parent    68f30fbee19cc67849b9fa8e153ede70758afe81
parent    90a4c0f51e8e44111a926be6f4c87af3938a79c3
Merge remote-tracking branch 'linus/master' into x86/urgent
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 48
-rw-r--r--  arch/x86/Kconfig.cpu | 6
-rw-r--r--  arch/x86/Kconfig.debug | 25
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/boot/compressed/Makefile | 10
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 1022
-rw-r--r--  arch/x86/boot/compressed/eboot.h | 61
-rw-r--r--  arch/x86/boot/compressed/efi_stub_32.S | 86
-rw-r--r--  arch/x86/boot/compressed/efi_stub_64.S | 1
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 22
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 20
-rw-r--r--  arch/x86/boot/compressed/string.c | 9
-rw-r--r--  arch/x86/boot/header.S | 158
-rw-r--r--  arch/x86/boot/string.c | 35
-rw-r--r--  arch/x86/boot/tools/build.c | 39
-rw-r--r--  arch/x86/crypto/Makefile | 4
-rw-r--r--  arch/x86/crypto/serpent-sse2-i586-asm_32.S | 638
-rw-r--r--  arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c | 1070
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c | 218
-rw-r--r--  arch/x86/ia32/Makefile | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 373
-rw-r--r--  arch/x86/ia32/nosyscall.c | 7
-rw-r--r--  arch/x86/ia32/syscall_ia32.c | 25
-rw-r--r--  arch/x86/include/asm/Kbuild | 5
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 2
-rw-r--r--  arch/x86/include/asm/bootparam.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 3
-rw-r--r--  arch/x86/include/asm/debugreg.h | 22
-rw-r--r--  arch/x86/include/asm/desc.h | 12
-rw-r--r--  arch/x86/include/asm/efi.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 2
-rw-r--r--  arch/x86/include/asm/ia32_unistd.h | 13
-rw-r--r--  arch/x86/include/asm/init.h | 2
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 90
-rw-r--r--  arch/x86/include/asm/mce.h | 2
-rw-r--r--  arch/x86/include/asm/mrst.h | 2
-rw-r--r--  arch/x86/include/asm/pci.h | 9
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 2
-rw-r--r--  arch/x86/include/asm/percpu.h | 28
-rw-r--r--  arch/x86/include/asm/serpent.h | 63
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/smp.h | 6
-rw-r--r--  arch/x86/include/asm/syscall.h | 1
-rw-r--r--  arch/x86/include/asm/thread_info.h | 2
-rw-r--r--  arch/x86/include/asm/topology.h | 2
-rw-r--r--  arch/x86/include/asm/unistd.h | 54
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 401
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 732
-rw-r--r--  arch/x86/include/asm/x86_init.h | 1
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/amd_nb.c | 31
-rw-r--r--  arch/x86/kernel/apm_32.c | 16
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 8
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 19
-rw-r--r--  arch/x86/kernel/cpu/common.c | 24
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 25
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 138
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 19
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 63
-rw-r--r--  arch/x86/kernel/cpuid.c | 2
-rw-r--r--  arch/x86/kernel/e820.c | 63
-rw-r--r--  arch/x86/kernel/early_printk.c | 4
-rw-r--r--  arch/x86/kernel/entry_32.S | 47
-rw-r--r--  arch/x86/kernel/entry_64.S | 232
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 1
-rw-r--r--  arch/x86/kernel/irq_32.c | 5
-rw-r--r--  arch/x86/kernel/irq_64.c | 35
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 181
-rw-r--r--  arch/x86/kernel/microcode_core.c | 64
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/nmi.c | 102
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 180
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/ptrace.c | 25
-rw-r--r--  arch/x86/kernel/setup.c | 7
-rw-r--r--  arch/x86/kernel/signal.c | 6
-rw-r--r--  arch/x86/kernel/smp.c | 72
-rw-r--r--  arch/x86/kernel/smpboot.c | 17
-rw-r--r--  arch/x86/kernel/syscall_32.c | 25
-rw-r--r--  arch/x86/kernel/syscall_64.c | 20
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 350
-rw-r--r--  arch/x86/kernel/traps.c | 20
-rw-r--r--  arch/x86/kernel/tsc.c | 20
-rw-r--r--  arch/x86/kernel/vm86_32.c | 6
-rw-r--r--  arch/x86/kernel/x86_init.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 3
-rw-r--r--  arch/x86/kvm/Makefile | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 670
-rw-r--r--  arch/x86/kvm/cpuid.h | 46
-rw-r--r--  arch/x86/kvm/emulate.c | 436
-rw-r--r--  arch/x86/kvm/i8254.c | 10
-rw-r--r--  arch/x86/kvm/i8259.c | 24
-rw-r--r--  arch/x86/kvm/lapic.c | 3
-rw-r--r--  arch/x86/kvm/lapic.h | 1
-rw-r--r--  arch/x86/kvm/mmu.c | 547
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 29
-rw-r--r--  arch/x86/kvm/mmutrace.h | 19
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 86
-rw-r--r--  arch/x86/kvm/pmu.c | 533
-rw-r--r--  arch/x86/kvm/svm.c | 15
-rw-r--r--  arch/x86/kvm/timer.c | 26
-rw-r--r--  arch/x86/kvm/vmx.c | 63
-rw-r--r--  arch/x86/kvm/x86.c | 1005
-rw-r--r--  arch/x86/kvm/x86.h | 5
-rw-r--r--  arch/x86/lguest/boot.c | 21
-rw-r--r--  arch/x86/mm/init.c | 23
-rw-r--r--  arch/x86/mm/init_32.c | 29
-rw-r--r--  arch/x86/mm/init_64.c | 11
-rw-r--r--  arch/x86/mm/mmap.c | 4
-rw-r--r--  arch/x86/mm/mmio-mod.c | 4
-rw-r--r--  arch/x86/mm/numa.c | 12
-rw-r--r--  arch/x86/mm/pageattr.c | 6
-rw-r--r--  arch/x86/mm/srat.c | 4
-rw-r--r--  arch/x86/pci/Makefile | 5
-rw-r--r--  arch/x86/pci/acpi.c | 75
-rw-r--r--  arch/x86/pci/amd_bus.c | 43
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 62
-rw-r--r--  arch/x86/pci/bus_numa.c | 31
-rw-r--r--  arch/x86/pci/common.c | 19
-rw-r--r--  arch/x86/pci/i386.c | 20
-rw-r--r--  arch/x86/pci/legacy.c | 3
-rw-r--r--  arch/x86/pci/numaq_32.c | 2
-rw-r--r--  arch/x86/pci/pcbios.c | 2
-rw-r--r--  arch/x86/platform/geode/alix.c | 2
-rw-r--r--  arch/x86/platform/iris/iris.c | 2
-rw-r--r--  arch/x86/platform/mrst/Makefile | 6
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 16
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 4
-rw-r--r--  arch/x86/platform/uv/uv_sysfs.c | 2
-rw-r--r--  arch/x86/syscalls/Makefile | 43
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 357
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 320
-rw-r--r--  arch/x86/syscalls/syscallhdr.sh | 27
-rw-r--r--  arch/x86/syscalls/syscalltbl.sh | 15
-rw-r--r--  arch/x86/um/Kconfig | 8
-rw-r--r--  arch/x86/um/Makefile | 3
-rw-r--r--  arch/x86/um/shared/sysdep/ptrace.h | 10
-rw-r--r--  arch/x86/um/sys_call_table_32.S | 26
-rw-r--r--  arch/x86/um/sys_call_table_32.c | 55
-rw-r--r--  arch/x86/um/sys_call_table_64.c | 33
-rw-r--r--  arch/x86/um/user-offsets.c | 15
-rw-r--r--  arch/x86/xen/Kconfig | 4
-rw-r--r--  arch/x86/xen/debugfs.c | 2
-rw-r--r--  arch/x86/xen/debugfs.h | 2
-rw-r--r--  arch/x86/xen/grant-table.c | 44
-rw-r--r--  arch/x86/xen/mmu.c | 2
153 files changed, 8721 insertions, 4145 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db190faffba1..864cc6e6ac8e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,8 +60,12 @@ config X86
60 select PERF_EVENTS 60 select PERF_EVENTS
61 select HAVE_PERF_EVENTS_NMI 61 select HAVE_PERF_EVENTS_NMI
62 select ANON_INODES 62 select ANON_INODES
63 select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
64 select HAVE_CMPXCHG_LOCAL if !M386
65 select HAVE_CMPXCHG_DOUBLE
63 select HAVE_ARCH_KMEMCHECK 66 select HAVE_ARCH_KMEMCHECK
64 select HAVE_USER_RETURN_NOTIFIER 67 select HAVE_USER_RETURN_NOTIFIER
68 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
65 select HAVE_ARCH_JUMP_LABEL 69 select HAVE_ARCH_JUMP_LABEL
66 select HAVE_TEXT_POKE_SMP 70 select HAVE_TEXT_POKE_SMP
67 select HAVE_GENERIC_HARDIRQS 71 select HAVE_GENERIC_HARDIRQS
@@ -77,6 +81,7 @@ config X86
77 select HAVE_BPF_JIT if (X86_64 && NET) 81 select HAVE_BPF_JIT if (X86_64 && NET)
78 select CLKEVT_I8253 82 select CLKEVT_I8253
79 select ARCH_HAVE_NMI_SAFE_CMPXCHG 83 select ARCH_HAVE_NMI_SAFE_CMPXCHG
84 select GENERIC_IOMAP
80 85
81config INSTRUCTION_DECODER 86config INSTRUCTION_DECODER
82 def_bool (KPROBES || PERF_EVENTS) 87 def_bool (KPROBES || PERF_EVENTS)
@@ -132,9 +137,6 @@ config NEED_SG_DMA_LENGTH
132config GENERIC_ISA_DMA 137config GENERIC_ISA_DMA
133 def_bool ISA_DMA_API 138 def_bool ISA_DMA_API
134 139
135config GENERIC_IOMAP
136 def_bool y
137
138config GENERIC_BUG 140config GENERIC_BUG
139 def_bool y 141 def_bool y
140 depends on BUG 142 depends on BUG
@@ -421,12 +423,14 @@ config X86_MRST
421 depends on PCI 423 depends on PCI
422 depends on PCI_GOANY 424 depends on PCI_GOANY
423 depends on X86_IO_APIC 425 depends on X86_IO_APIC
426 select X86_INTEL_MID
427 select SFI
428 select DW_APB_TIMER
424 select APB_TIMER 429 select APB_TIMER
425 select I2C 430 select I2C
426 select SPI 431 select SPI
427 select INTEL_SCU_IPC 432 select INTEL_SCU_IPC
428 select X86_PLATFORM_DEVICES 433 select X86_PLATFORM_DEVICES
429 select X86_INTEL_MID
430 ---help--- 434 ---help---
431 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin 435 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
432 Internet Device(MID) platform. Moorestown consists of two chips: 436 Internet Device(MID) platform. Moorestown consists of two chips:
@@ -435,6 +439,26 @@ config X86_MRST
435 nor standard legacy replacement devices/features. e.g. Moorestown does 439 nor standard legacy replacement devices/features. e.g. Moorestown does
436 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. 440 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
437 441
442config X86_MDFLD
443 bool "Medfield MID platform"
444 depends on PCI
445 depends on PCI_GOANY
446 depends on X86_IO_APIC
447 select X86_INTEL_MID
448 select SFI
449 select DW_APB_TIMER
450 select APB_TIMER
451 select I2C
452 select SPI
453 select INTEL_SCU_IPC
454 select X86_PLATFORM_DEVICES
455 ---help---
456 Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 457 Internet Device (MID) platform.
458 Unlike standard x86 PCs, Medfield does not have many legacy devices
459 nor standard legacy replacement devices/features. e.g. Medfield does
460 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
461
438endif 462endif
439 463
440config X86_RDC321X 464config X86_RDC321X
@@ -632,7 +656,7 @@ config X86_SUMMIT_NUMA
632 656
633config X86_CYCLONE_TIMER 657config X86_CYCLONE_TIMER
634 def_bool y 658 def_bool y
635 depends on X86_32_NON_STANDARD 659 depends on X86_SUMMIT
636 660
637source "arch/x86/Kconfig.cpu" 661source "arch/x86/Kconfig.cpu"
638 662
@@ -660,9 +684,10 @@ config HPET_EMULATE_RTC
660 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) 684 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
661 685
662config APB_TIMER 686config APB_TIMER
663 def_bool y if MRST 687 def_bool y if X86_INTEL_MID
664 prompt "Langwell APB Timer Support" if X86_MRST 688 prompt "Intel MID APB Timer Support" if X86_INTEL_MID
665 select DW_APB_TIMER 689 select DW_APB_TIMER
690 depends on X86_INTEL_MID && SFI
666 help 691 help
667 APB timer is the replacement for 8254, HPET on X86 MID platforms. 692 APB timer is the replacement for 8254, HPET on X86 MID platforms.
668 The APBT provides a stable time base on SMP 693 The APBT provides a stable time base on SMP
@@ -1490,6 +1515,13 @@ config EFI
1490 resultant kernel should continue to boot on existing non-EFI 1515 resultant kernel should continue to boot on existing non-EFI
1491 platforms. 1516 platforms.
1492 1517
1518config EFI_STUB
1519 bool "EFI stub support"
1520 depends on EFI
1521 ---help---
1522 This kernel feature allows a bzImage to be loaded directly
1523 by EFI firmware without the use of a bootloader.
1524
1493config SECCOMP 1525config SECCOMP
1494 def_bool y 1526 def_bool y
1495 prompt "Enable seccomp to safely compute untrusted bytecode" 1527 prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1742,7 +1774,7 @@ source "drivers/sfi/Kconfig"
1742 1774
1743config X86_APM_BOOT 1775config X86_APM_BOOT
1744 def_bool y 1776 def_bool y
1745 depends on APM || APM_MODULE 1777 depends on APM
1746 1778
1747menuconfig APM 1779menuconfig APM
1748 tristate "APM (Advanced Power Management) BIOS support" 1780 tristate "APM (Advanced Power Management) BIOS support"
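
The new EFI_STUB option above is what turns the bzImage into a dual-format image: the same file keeps its legacy boot sector but also carries the MZ signature and PE/COFF header added to arch/x86/boot/header.S later in this patch, so EFI firmware can execute it directly. A minimal sketch (hosted C, hypothetical checker, not part of the patch) of verifying those signatures in a built bzImage:

    #include <stdio.h>
    #include <stdint.h>

    int main(int argc, char **argv)
    {
            uint8_t buf[64];
            FILE *f = argc > 1 ? fopen(argv[1], "rb") : NULL;

            if (!f || fread(buf, 1, sizeof(buf), f) != sizeof(buf))
                    return 1;
            if (buf[0] == 'M' && buf[1] == 'Z')
                    /* e_lfanew at offset 0x3c points at the PE header */
                    printf("PE header at offset %u\n",
                           *(uint32_t *)&buf[0x3c]);
            else
                    printf("no MZ signature (CONFIG_EFI_STUB off?)\n");
            fclose(f);
            return 0;
    }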
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
309config X86_CMPXCHG 309config X86_CMPXCHG
310 def_bool X86_64 || (X86_32 && !M386) 310 def_bool X86_64 || (X86_32 && !M386)
311 311
312config CMPXCHG_LOCAL
313 def_bool X86_64 || (X86_32 && !M386)
314
315config CMPXCHG_DOUBLE
316 def_bool y
317
318config X86_L1_CACHE_SHIFT 312config X86_L1_CACHE_SHIFT
319 int 313 int
320 default "7" if MPENTIUM4 || MPSC 314 default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..e46c2147397f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_MRST 46config EARLY_PRINTK_INTEL_MID
47 bool "Early printk for MRST platform support" 47 bool "Early printk for Intel MID platform support"
48 depends on EARLY_PRINTK && X86_MRST 48 depends on EARLY_PRINTK && X86_INTEL_MID
49 49
50config EARLY_PRINTK_DBGP 50config EARLY_PRINTK_DBGP
51 bool "Early printk via EHCI debug port" 51 bool "Early printk via EHCI debug port"
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
63 bool "Check for stack overflows" 63 bool "Check for stack overflows"
64 depends on DEBUG_KERNEL 64 depends on DEBUG_KERNEL
65 ---help--- 65 ---help---
 66 This option will cause messages to be printed if free stack space 66 Say Y here if you want to check for overflows of the kernel, IRQ
 67 drops below a certain limit. 67 and exception stacks. This option will print detailed messages
 68 about the stacks when free stack space drops below a certain
 69 limit.
70 If in doubt, say "N".
68 71
69config X86_PTDUMP 72config X86_PTDUMP
70 bool "Export kernel pagetable layout to userspace via debugfs" 73 bool "Export kernel pagetable layout to userspace via debugfs"
@@ -284,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
284 287
285 If unsure, or if you run an older (pre 4.4) gcc, say N. 288 If unsure, or if you run an older (pre 4.4) gcc, say N.
286 289
290config DEBUG_NMI_SELFTEST
291 bool "NMI Selftest"
292 depends on DEBUG_KERNEL && X86_LOCAL_APIC
293 ---help---
294 Enabling this option turns on a quick NMI selftest to verify
295 that the NMI behaves correctly.
296
297 This might help diagnose strange hangs that rely on NMI to
298 function properly.
299
300 If unsure, say N.
301
287endmenu 302endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b02e509072a7..209ba1294592 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,6 +118,12 @@ KBUILD_CFLAGS += $(mflags-y)
118KBUILD_AFLAGS += $(mflags-y) 118KBUILD_AFLAGS += $(mflags-y)
119 119
120### 120###
121# Syscall table generation
122
123archheaders:
124 $(Q)$(MAKE) $(build)=arch/x86/syscalls all
125
126###
121# Kernel objects 127# Kernel objects
122 128
123head-y := arch/x86/kernel/head_$(BITS).o 129head-y := arch/x86/kernel/head_$(BITS).o
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664efb9cee..b123b9a8f5b3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
23 23
24hostprogs-y := mkpiggy 24hostprogs-y := mkpiggy
25 25
26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE 26VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
27 $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
28 $(obj)/piggy.o
29
30ifeq ($(CONFIG_EFI_STUB), y)
31 VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
32endif
33
34$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
27 $(call if_changed,ld) 35 $(call if_changed,ld)
28 @: 36 @:
29 37
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000000..fec216f4fbc3
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1022 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2011 Intel Corporation; author Matt Fleming
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2.
7 *
8 * ----------------------------------------------------------------------- */
9
10#include <linux/efi.h>
11#include <asm/efi.h>
12#include <asm/setup.h>
13#include <asm/desc.h>
14
15#include "eboot.h"
16
17static efi_system_table_t *sys_table;
18
19static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
20 unsigned long *desc_size)
21{
22 efi_memory_desc_t *m = NULL;
23 efi_status_t status;
24 unsigned long key;
25 u32 desc_version;
26
27 *map_size = sizeof(*m) * 32;
28again:
29 /*
 30 * Add an additional efi_memory_desc_t because the allocation
 31 * we are doing here may itself create a new descriptor region.
32 */
33 *map_size += sizeof(*m);
34 status = efi_call_phys3(sys_table->boottime->allocate_pool,
35 EFI_LOADER_DATA, *map_size, (void **)&m);
36 if (status != EFI_SUCCESS)
37 goto fail;
38
39 status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
40 m, &key, desc_size, &desc_version);
41 if (status == EFI_BUFFER_TOO_SMALL) {
42 efi_call_phys1(sys_table->boottime->free_pool, m);
43 goto again;
44 }
45
46 if (status != EFI_SUCCESS)
47 efi_call_phys1(sys_table->boottime->free_pool, m);
48
49fail:
50 *map = m;
51 return status;
52}
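/*
 * Note (illustrative, not part of the patch): the retry above is the
 * standard UEFI idiom for get_memory_map() -- the allocate_pool()
 * call can itself split a region and add a descriptor, so growing
 * the buffer by one extra descriptor per pass keeps the loop from
 * bouncing on EFI_BUFFER_TOO_SMALL forever.
 */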
53
54/*
55 * Allocate at the highest possible address that is not above 'max'.
56 */
57static efi_status_t high_alloc(unsigned long size, unsigned long align,
58 unsigned long *addr, unsigned long max)
59{
60 unsigned long map_size, desc_size;
61 efi_memory_desc_t *map;
62 efi_status_t status;
63 unsigned long nr_pages;
64 u64 max_addr = 0;
65 int i;
66
67 status = __get_map(&map, &map_size, &desc_size);
68 if (status != EFI_SUCCESS)
69 goto fail;
70
71 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
72again:
73 for (i = 0; i < map_size / desc_size; i++) {
74 efi_memory_desc_t *desc;
75 unsigned long m = (unsigned long)map;
76 u64 start, end;
77
78 desc = (efi_memory_desc_t *)(m + (i * desc_size));
79 if (desc->type != EFI_CONVENTIONAL_MEMORY)
80 continue;
81
82 if (desc->num_pages < nr_pages)
83 continue;
84
85 start = desc->phys_addr;
86 end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
87
88 if ((start + size) > end || (start + size) > max)
89 continue;
90
91 if (end - size > max)
92 end = max;
93
94 if (round_down(end - size, align) < start)
95 continue;
96
97 start = round_down(end - size, align);
98
99 /*
100 * Don't allocate at 0x0. It will confuse code that
101 * checks pointers against NULL.
102 */
103 if (start == 0x0)
104 continue;
105
106 if (start > max_addr)
107 max_addr = start;
108 }
109
110 if (!max_addr)
111 status = EFI_NOT_FOUND;
112 else {
113 status = efi_call_phys4(sys_table->boottime->allocate_pages,
114 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
115 nr_pages, &max_addr);
116 if (status != EFI_SUCCESS) {
117 max = max_addr;
118 max_addr = 0;
119 goto again;
120 }
121
122 *addr = max_addr;
123 }
124
125free_pool:
126 efi_call_phys1(sys_table->boottime->free_pool, map);
127
128fail:
129 return status;
130}
131
132/*
133 * Allocate at the lowest possible address.
134 */
135static efi_status_t low_alloc(unsigned long size, unsigned long align,
136 unsigned long *addr)
137{
138 unsigned long map_size, desc_size;
139 efi_memory_desc_t *map;
140 efi_status_t status;
141 unsigned long nr_pages;
142 int i;
143
144 status = __get_map(&map, &map_size, &desc_size);
145 if (status != EFI_SUCCESS)
146 goto fail;
147
148 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
149 for (i = 0; i < map_size / desc_size; i++) {
150 efi_memory_desc_t *desc;
151 unsigned long m = (unsigned long)map;
152 u64 start, end;
153
154 desc = (efi_memory_desc_t *)(m + (i * desc_size));
155
156 if (desc->type != EFI_CONVENTIONAL_MEMORY)
157 continue;
158
159 if (desc->num_pages < nr_pages)
160 continue;
161
162 start = desc->phys_addr;
163 end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
164
165 /*
166 * Don't allocate at 0x0. It will confuse code that
167 * checks pointers against NULL. Skip the first 8
168 * bytes so we start at a nice even number.
169 */
170 if (start == 0x0)
171 start += 8;
172
173 start = round_up(start, align);
174 if ((start + size) > end)
175 continue;
176
177 status = efi_call_phys4(sys_table->boottime->allocate_pages,
178 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
179 nr_pages, &start);
180 if (status == EFI_SUCCESS) {
181 *addr = start;
182 break;
183 }
184 }
185
186 if (i == map_size / desc_size)
187 status = EFI_NOT_FOUND;
188
189free_pool:
190 efi_call_phys1(sys_table->boottime->free_pool, map);
191fail:
192 return status;
193}
194
195static void low_free(unsigned long size, unsigned long addr)
196{
197 unsigned long nr_pages;
198
199 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
 200 efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages);
201}
202
203static void find_bits(unsigned long mask, u8 *pos, u8 *size)
204{
205 u8 first, len;
206
207 first = 0;
208 len = 0;
209
210 if (mask) {
211 while (!(mask & 0x1)) {
212 mask = mask >> 1;
213 first++;
214 }
215
216 while (mask & 0x1) {
217 mask = mask >> 1;
218 len++;
219 }
220 }
221
222 *pos = first;
223 *size = len;
224}
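/*
 * Example (illustrative, not part of the patch): for a blue mask of
 * 0x000000ff, find_bits() yields *pos = 0 and *size = 8; for a red
 * mask of 0x00ff0000 it yields *pos = 16 and *size = 8 -- exactly
 * the values the PIXEL_BIT_MASK case below stores into screen_info.
 */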
225
226/*
227 * See if we have Graphics Output Protocol
228 */
229static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
230 unsigned long size)
231{
232 struct efi_graphics_output_protocol *gop, *first_gop;
233 struct efi_pixel_bitmask pixel_info;
234 unsigned long nr_gops;
235 efi_status_t status;
236 void **gop_handle;
237 u16 width, height;
238 u32 fb_base, fb_size;
239 u32 pixels_per_scan_line;
240 int pixel_format;
241 int i;
242
243 status = efi_call_phys3(sys_table->boottime->allocate_pool,
244 EFI_LOADER_DATA, size, &gop_handle);
245 if (status != EFI_SUCCESS)
246 return status;
247
248 status = efi_call_phys5(sys_table->boottime->locate_handle,
249 EFI_LOCATE_BY_PROTOCOL, proto,
250 NULL, &size, gop_handle);
251 if (status != EFI_SUCCESS)
252 goto free_handle;
253
254 first_gop = NULL;
255
256 nr_gops = size / sizeof(void *);
257 for (i = 0; i < nr_gops; i++) {
258 struct efi_graphics_output_mode_info *info;
259 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
260 void *pciio;
261 void *h = gop_handle[i];
262
263 status = efi_call_phys3(sys_table->boottime->handle_protocol,
264 h, proto, &gop);
265 if (status != EFI_SUCCESS)
266 continue;
267
268 efi_call_phys3(sys_table->boottime->handle_protocol,
269 h, &pciio_proto, &pciio);
270
271 status = efi_call_phys4(gop->query_mode, gop,
272 gop->mode->mode, &size, &info);
273 if (status == EFI_SUCCESS && (!first_gop || pciio)) {
274 /*
275 * Apple provide GOPs that are not backed by
276 * real hardware (they're used to handle
277 * multiple displays). The workaround is to
278 * search for a GOP implementing the PCIIO
279 * protocol, and if one isn't found, to just
 280 * fall back to the first GOP.
281 */
282 width = info->horizontal_resolution;
283 height = info->vertical_resolution;
284 fb_base = gop->mode->frame_buffer_base;
285 fb_size = gop->mode->frame_buffer_size;
286 pixel_format = info->pixel_format;
287 pixel_info = info->pixel_information;
288 pixels_per_scan_line = info->pixels_per_scan_line;
289
290 /*
291 * Once we've found a GOP supporting PCIIO,
292 * don't bother looking any further.
293 */
294 if (pciio)
295 break;
296
297 first_gop = gop;
298 }
299 }
300
301 /* Did we find any GOPs? */
302 if (!first_gop)
303 goto free_handle;
304
305 /* EFI framebuffer */
306 si->orig_video_isVGA = VIDEO_TYPE_EFI;
307
308 si->lfb_width = width;
309 si->lfb_height = height;
310 si->lfb_base = fb_base;
311 si->lfb_size = fb_size;
312 si->pages = 1;
313
314 if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
315 si->lfb_depth = 32;
316 si->lfb_linelength = pixels_per_scan_line * 4;
317 si->red_size = 8;
318 si->red_pos = 0;
319 si->green_size = 8;
320 si->green_pos = 8;
321 si->blue_size = 8;
322 si->blue_pos = 16;
323 si->rsvd_size = 8;
324 si->rsvd_pos = 24;
325 } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
326 si->lfb_depth = 32;
327 si->lfb_linelength = pixels_per_scan_line * 4;
328 si->red_size = 8;
329 si->red_pos = 16;
330 si->green_size = 8;
331 si->green_pos = 8;
332 si->blue_size = 8;
333 si->blue_pos = 0;
334 si->rsvd_size = 8;
335 si->rsvd_pos = 24;
336 } else if (pixel_format == PIXEL_BIT_MASK) {
337 find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
338 find_bits(pixel_info.green_mask, &si->green_pos,
339 &si->green_size);
340 find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
341 find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
342 &si->rsvd_size);
343 si->lfb_depth = si->red_size + si->green_size +
344 si->blue_size + si->rsvd_size;
345 si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
346 } else {
347 si->lfb_depth = 4;
348 si->lfb_linelength = si->lfb_width / 2;
349 si->red_size = 0;
350 si->red_pos = 0;
351 si->green_size = 0;
352 si->green_pos = 0;
353 si->blue_size = 0;
354 si->blue_pos = 0;
355 si->rsvd_size = 0;
356 si->rsvd_pos = 0;
357 }
358
359free_handle:
360 efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
361 return status;
362}
363
364/*
365 * See if we have Universal Graphics Adapter (UGA) protocol
366 */
367static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
368 unsigned long size)
369{
370 struct efi_uga_draw_protocol *uga, *first_uga;
371 unsigned long nr_ugas;
372 efi_status_t status;
373 u32 width, height;
374 void **uga_handle = NULL;
375 int i;
376
377 status = efi_call_phys3(sys_table->boottime->allocate_pool,
378 EFI_LOADER_DATA, size, &uga_handle);
379 if (status != EFI_SUCCESS)
380 return status;
381
382 status = efi_call_phys5(sys_table->boottime->locate_handle,
383 EFI_LOCATE_BY_PROTOCOL, uga_proto,
384 NULL, &size, uga_handle);
385 if (status != EFI_SUCCESS)
386 goto free_handle;
387
388 first_uga = NULL;
389
390 nr_ugas = size / sizeof(void *);
391 for (i = 0; i < nr_ugas; i++) {
392 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
393 void *handle = uga_handle[i];
394 u32 w, h, depth, refresh;
395 void *pciio;
396
397 status = efi_call_phys3(sys_table->boottime->handle_protocol,
398 handle, uga_proto, &uga);
399 if (status != EFI_SUCCESS)
400 continue;
401
402 efi_call_phys3(sys_table->boottime->handle_protocol,
403 handle, &pciio_proto, &pciio);
404
405 status = efi_call_phys5(uga->get_mode, uga, &w, &h,
406 &depth, &refresh);
407 if (status == EFI_SUCCESS && (!first_uga || pciio)) {
408 width = w;
409 height = h;
410
411 /*
412 * Once we've found a UGA supporting PCIIO,
413 * don't bother looking any further.
414 */
415 if (pciio)
416 break;
417
418 first_uga = uga;
419 }
420 }
421
422 if (!first_uga)
423 goto free_handle;
424
425 /* EFI framebuffer */
426 si->orig_video_isVGA = VIDEO_TYPE_EFI;
427
428 si->lfb_depth = 32;
429 si->lfb_width = width;
430 si->lfb_height = height;
431
432 si->red_size = 8;
433 si->red_pos = 16;
434 si->green_size = 8;
435 si->green_pos = 8;
436 si->blue_size = 8;
437 si->blue_pos = 0;
438 si->rsvd_size = 8;
439 si->rsvd_pos = 24;
440
441
442free_handle:
443 efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
444 return status;
445}
446
447void setup_graphics(struct boot_params *boot_params)
448{
449 efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
450 struct screen_info *si;
451 efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
452 efi_status_t status;
453 unsigned long size;
454 void **gop_handle = NULL;
455 void **uga_handle = NULL;
456
457 si = &boot_params->screen_info;
458 memset(si, 0, sizeof(*si));
459
460 size = 0;
461 status = efi_call_phys5(sys_table->boottime->locate_handle,
462 EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
463 NULL, &size, gop_handle);
464 if (status == EFI_BUFFER_TOO_SMALL)
465 status = setup_gop(si, &graphics_proto, size);
466
467 if (status != EFI_SUCCESS) {
468 size = 0;
469 status = efi_call_phys5(sys_table->boottime->locate_handle,
470 EFI_LOCATE_BY_PROTOCOL, &uga_proto,
471 NULL, &size, uga_handle);
472 if (status == EFI_BUFFER_TOO_SMALL)
473 setup_uga(si, &uga_proto, size);
474 }
475}
476
477struct initrd {
478 efi_file_handle_t *handle;
479 u64 size;
480};
481
482/*
 483 * Check the cmdline for LILO-style initrd= arguments.
484 *
485 * We only support loading an initrd from the same filesystem as the
486 * kernel image.
487 */
488static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
489 struct setup_header *hdr)
490{
491 struct initrd *initrds;
492 unsigned long initrd_addr;
493 efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
494 u64 initrd_total;
495 efi_file_io_interface_t *io;
496 efi_file_handle_t *fh;
497 efi_status_t status;
498 int nr_initrds;
499 char *str;
500 int i, j, k;
501
502 initrd_addr = 0;
503 initrd_total = 0;
504
505 str = (char *)(unsigned long)hdr->cmd_line_ptr;
506
507 j = 0; /* See close_handles */
508
509 if (!str || !*str)
510 return EFI_SUCCESS;
511
512 for (nr_initrds = 0; *str; nr_initrds++) {
513 str = strstr(str, "initrd=");
514 if (!str)
515 break;
516
517 str += 7;
518
519 /* Skip any leading slashes */
520 while (*str == '/' || *str == '\\')
521 str++;
522
523 while (*str && *str != ' ' && *str != '\n')
524 str++;
525 }
526
527 if (!nr_initrds)
528 return EFI_SUCCESS;
529
530 status = efi_call_phys3(sys_table->boottime->allocate_pool,
531 EFI_LOADER_DATA,
532 nr_initrds * sizeof(*initrds),
533 &initrds);
534 if (status != EFI_SUCCESS)
535 goto fail;
536
537 str = (char *)(unsigned long)hdr->cmd_line_ptr;
538 for (i = 0; i < nr_initrds; i++) {
539 struct initrd *initrd;
540 efi_file_handle_t *h;
541 efi_file_info_t *info;
542 efi_char16_t filename[256];
543 unsigned long info_sz;
544 efi_guid_t info_guid = EFI_FILE_INFO_ID;
545 efi_char16_t *p;
546 u64 file_sz;
547
548 str = strstr(str, "initrd=");
549 if (!str)
550 break;
551
552 str += 7;
553
554 initrd = &initrds[i];
555 p = filename;
556
557 /* Skip any leading slashes */
558 while (*str == '/' || *str == '\\')
559 str++;
560
561 while (*str && *str != ' ' && *str != '\n') {
 562 if (p + 1 >= filename + sizeof(filename) / sizeof(*filename))
563 break;
564
565 *p++ = *str++;
566 }
567
568 *p = '\0';
569
570 /* Only open the volume once. */
571 if (!i) {
572 efi_boot_services_t *boottime;
573
574 boottime = sys_table->boottime;
575
576 status = efi_call_phys3(boottime->handle_protocol,
577 image->device_handle, &fs_proto, &io);
578 if (status != EFI_SUCCESS)
579 goto free_initrds;
580
581 status = efi_call_phys2(io->open_volume, io, &fh);
582 if (status != EFI_SUCCESS)
583 goto free_initrds;
584 }
585
586 status = efi_call_phys5(fh->open, fh, &h, filename,
587 EFI_FILE_MODE_READ, (u64)0);
588 if (status != EFI_SUCCESS)
589 goto close_handles;
590
591 initrd->handle = h;
592
593 info_sz = 0;
594 status = efi_call_phys4(h->get_info, h, &info_guid,
595 &info_sz, NULL);
596 if (status != EFI_BUFFER_TOO_SMALL)
597 goto close_handles;
598
599grow:
600 status = efi_call_phys3(sys_table->boottime->allocate_pool,
601 EFI_LOADER_DATA, info_sz, &info);
602 if (status != EFI_SUCCESS)
603 goto close_handles;
604
605 status = efi_call_phys4(h->get_info, h, &info_guid,
606 &info_sz, info);
607 if (status == EFI_BUFFER_TOO_SMALL) {
608 efi_call_phys1(sys_table->boottime->free_pool, info);
609 goto grow;
610 }
611
612 file_sz = info->file_size;
613 efi_call_phys1(sys_table->boottime->free_pool, info);
614
615 if (status != EFI_SUCCESS)
616 goto close_handles;
617
618 initrd->size = file_sz;
619 initrd_total += file_sz;
620 }
621
622 if (initrd_total) {
623 unsigned long addr;
624
625 /*
 626 * Multiple initrds need to be at consecutive
 627 * addresses in memory, so allocate enough memory for
 628 * all of them.
629 */
630 status = high_alloc(initrd_total, 0x1000,
631 &initrd_addr, hdr->initrd_addr_max);
632 if (status != EFI_SUCCESS)
633 goto close_handles;
634
635 /* We've run out of free low memory. */
636 if (initrd_addr > hdr->initrd_addr_max) {
637 status = EFI_INVALID_PARAMETER;
638 goto free_initrd_total;
639 }
640
641 addr = initrd_addr;
642 for (j = 0; j < nr_initrds; j++) {
643 u64 size;
644
645 size = initrds[j].size;
646 while (size) {
647 u64 chunksize;
648 if (size > EFI_READ_CHUNK_SIZE)
649 chunksize = EFI_READ_CHUNK_SIZE;
650 else
651 chunksize = size;
652 status = efi_call_phys3(fh->read,
653 initrds[j].handle,
654 &chunksize, addr);
655 if (status != EFI_SUCCESS)
656 goto free_initrd_total;
657 addr += chunksize;
658 size -= chunksize;
659 }
660
661 efi_call_phys1(fh->close, initrds[j].handle);
662 }
663
664 }
665
666 efi_call_phys1(sys_table->boottime->free_pool, initrds);
667
668 hdr->ramdisk_image = initrd_addr;
669 hdr->ramdisk_size = initrd_total;
670
671 return status;
672
673free_initrd_total:
674 low_free(initrd_total, initrd_addr);
675
676close_handles:
677 for (k = j; k < nr_initrds; k++)
678 efi_call_phys1(fh->close, initrds[k].handle);
679free_initrds:
680 efi_call_phys1(sys_table->boottime->free_pool, initrds);
681fail:
682 hdr->ramdisk_image = 0;
683 hdr->ramdisk_size = 0;
684
685 return status;
686}
687
688/*
 689 * Because the x86 boot code expects to be passed a boot_params, we
690 * need to create one ourselves (usually the bootloader would create
691 * one for us).
692 */
693static efi_status_t make_boot_params(struct boot_params *boot_params,
694 efi_loaded_image_t *image,
695 void *handle)
696{
697 struct efi_info *efi = &boot_params->efi_info;
698 struct apm_bios_info *bi = &boot_params->apm_bios_info;
699 struct sys_desc_table *sdt = &boot_params->sys_desc_table;
700 struct e820entry *e820_map = &boot_params->e820_map[0];
701 struct e820entry *prev = NULL;
702 struct setup_header *hdr = &boot_params->hdr;
703 unsigned long size, key, desc_size, _size;
704 efi_memory_desc_t *mem_map;
705 void *options = image->load_options;
706 u32 load_options_size = image->load_options_size / 2; /* ASCII */
707 int options_size = 0;
708 efi_status_t status;
709 __u32 desc_version;
710 unsigned long cmdline;
711 u8 nr_entries;
712 u16 *s2;
713 u8 *s1;
714 int i;
715
716 hdr->type_of_loader = 0x21;
717
718 /* Convert unicode cmdline to ascii */
719 cmdline = 0;
720 s2 = (u16 *)options;
721
722 if (s2) {
723 while (*s2 && *s2 != '\n' && options_size < load_options_size) {
724 s2++;
725 options_size++;
726 }
727
728 if (options_size) {
729 if (options_size > hdr->cmdline_size)
730 options_size = hdr->cmdline_size;
731
732 options_size++; /* NUL termination */
733
734 status = low_alloc(options_size, 1, &cmdline);
735 if (status != EFI_SUCCESS)
736 goto fail;
737
738 s1 = (u8 *)(unsigned long)cmdline;
739 s2 = (u16 *)options;
740
741 for (i = 0; i < options_size - 1; i++)
742 *s1++ = *s2++;
743
744 *s1 = '\0';
745 }
746 }
747
748 hdr->cmd_line_ptr = cmdline;
749
750 hdr->ramdisk_image = 0;
751 hdr->ramdisk_size = 0;
752
753 status = handle_ramdisks(image, hdr);
754 if (status != EFI_SUCCESS)
755 goto free_cmdline;
756
757 setup_graphics(boot_params);
758
759 /* Clear APM BIOS info */
760 memset(bi, 0, sizeof(*bi));
761
762 memset(sdt, 0, sizeof(*sdt));
763
764 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
765
766 size = sizeof(*mem_map) * 32;
767
768again:
769 size += sizeof(*mem_map);
770 _size = size;
771 status = low_alloc(size, 1, (unsigned long *)&mem_map);
772 if (status != EFI_SUCCESS)
773 goto free_cmdline;
774
775 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
776 mem_map, &key, &desc_size, &desc_version);
777 if (status == EFI_BUFFER_TOO_SMALL) {
778 low_free(_size, (unsigned long)mem_map);
779 goto again;
780 }
781
782 if (status != EFI_SUCCESS)
783 goto free_mem_map;
784
785 efi->efi_systab = (unsigned long)sys_table;
786 efi->efi_memdesc_size = desc_size;
787 efi->efi_memdesc_version = desc_version;
788 efi->efi_memmap = (unsigned long)mem_map;
789 efi->efi_memmap_size = size;
790
791#ifdef CONFIG_X86_64
792 efi->efi_systab_hi = (unsigned long)sys_table >> 32;
793 efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
794#endif
795
796 /* Might as well exit boot services now */
797 status = efi_call_phys2(sys_table->boottime->exit_boot_services,
798 handle, key);
799 if (status != EFI_SUCCESS)
800 goto free_mem_map;
801
802 /* Historic? */
803 boot_params->alt_mem_k = 32 * 1024;
804
805 /*
806 * Convert the EFI memory map to E820.
807 */
808 nr_entries = 0;
809 for (i = 0; i < size / desc_size; i++) {
810 efi_memory_desc_t *d;
811 unsigned int e820_type = 0;
812 unsigned long m = (unsigned long)mem_map;
813
814 d = (efi_memory_desc_t *)(m + (i * desc_size));
815 switch (d->type) {
816 case EFI_RESERVED_TYPE:
817 case EFI_RUNTIME_SERVICES_CODE:
818 case EFI_RUNTIME_SERVICES_DATA:
819 case EFI_MEMORY_MAPPED_IO:
820 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
821 case EFI_PAL_CODE:
822 e820_type = E820_RESERVED;
823 break;
824
825 case EFI_UNUSABLE_MEMORY:
826 e820_type = E820_UNUSABLE;
827 break;
828
829 case EFI_ACPI_RECLAIM_MEMORY:
830 e820_type = E820_ACPI;
831 break;
832
833 case EFI_LOADER_CODE:
834 case EFI_LOADER_DATA:
835 case EFI_BOOT_SERVICES_CODE:
836 case EFI_BOOT_SERVICES_DATA:
837 case EFI_CONVENTIONAL_MEMORY:
838 e820_type = E820_RAM;
839 break;
840
841 case EFI_ACPI_MEMORY_NVS:
842 e820_type = E820_NVS;
843 break;
844
845 default:
846 continue;
847 }
848
849 /* Merge adjacent mappings */
850 if (prev && prev->type == e820_type &&
851 (prev->addr + prev->size) == d->phys_addr)
852 prev->size += d->num_pages << 12;
853 else {
854 e820_map->addr = d->phys_addr;
855 e820_map->size = d->num_pages << 12;
856 e820_map->type = e820_type;
857 prev = e820_map++;
858 nr_entries++;
859 }
860 }
861
862 boot_params->e820_entries = nr_entries;
863
864 return EFI_SUCCESS;
865
866free_mem_map:
867 low_free(_size, (unsigned long)mem_map);
868free_cmdline:
869 if (options_size)
870 low_free(options_size, hdr->cmd_line_ptr);
871fail:
872 return status;
873}
874
875/*
876 * On success we return a pointer to a boot_params structure, and NULL
877 * on failure.
878 */
879struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
880{
881 struct boot_params *boot_params;
882 unsigned long start, nr_pages;
883 struct desc_ptr *gdt, *idt;
884 efi_loaded_image_t *image;
885 struct setup_header *hdr;
886 efi_status_t status;
887 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
888 struct desc_struct *desc;
889
890 sys_table = _table;
891
892 /* Check if we were booted by the EFI firmware */
893 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
894 goto fail;
895
896 status = efi_call_phys3(sys_table->boottime->handle_protocol,
897 handle, &proto, (void *)&image);
898 if (status != EFI_SUCCESS)
899 goto fail;
900
901 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
902 if (status != EFI_SUCCESS)
903 goto fail;
904
905 memset(boot_params, 0x0, 0x4000);
906
907 /* Copy first two sectors to boot_params */
908 memcpy(boot_params, image->image_base, 1024);
909
910 hdr = &boot_params->hdr;
911
912 /*
913 * The EFI firmware loader could have placed the kernel image
914 * anywhere in memory, but the kernel has various restrictions
915 * on the max physical address it can run at. Attempt to move
916 * the kernel to boot_params.pref_address, or as low as
917 * possible.
918 */
919 start = hdr->pref_address;
920 nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
921
922 status = efi_call_phys4(sys_table->boottime->allocate_pages,
923 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
924 nr_pages, &start);
925 if (status != EFI_SUCCESS) {
926 status = low_alloc(hdr->init_size, hdr->kernel_alignment,
927 &start);
928 if (status != EFI_SUCCESS)
929 goto fail;
930 }
931
932 hdr->code32_start = (__u32)start;
933 hdr->pref_address = (__u64)(unsigned long)image->image_base;
934
935 memcpy((void *)start, image->image_base, image->image_size);
936
937 status = efi_call_phys3(sys_table->boottime->allocate_pool,
938 EFI_LOADER_DATA, sizeof(*gdt),
939 (void **)&gdt);
940 if (status != EFI_SUCCESS)
941 goto fail;
942
943 gdt->size = 0x800;
944 status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
945 if (status != EFI_SUCCESS)
946 goto fail;
947
948 status = efi_call_phys3(sys_table->boottime->allocate_pool,
949 EFI_LOADER_DATA, sizeof(*idt),
950 (void **)&idt);
951 if (status != EFI_SUCCESS)
952 goto fail;
953
954 idt->size = 0;
955 idt->address = 0;
956
957 status = make_boot_params(boot_params, image, handle);
958 if (status != EFI_SUCCESS)
959 goto fail;
960
961 memset((char *)gdt->address, 0x0, gdt->size);
962 desc = (struct desc_struct *)gdt->address;
963
 964 /* The first GDT entry is a dummy and the second is unused. */
965 desc += 2;
966
967 desc->limit0 = 0xffff;
968 desc->base0 = 0x0000;
969 desc->base1 = 0x0000;
970 desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
971 desc->s = DESC_TYPE_CODE_DATA;
972 desc->dpl = 0;
973 desc->p = 1;
974 desc->limit = 0xf;
975 desc->avl = 0;
976 desc->l = 0;
977 desc->d = SEG_OP_SIZE_32BIT;
978 desc->g = SEG_GRANULARITY_4KB;
979 desc->base2 = 0x00;
980
981 desc++;
982 desc->limit0 = 0xffff;
983 desc->base0 = 0x0000;
984 desc->base1 = 0x0000;
985 desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
986 desc->s = DESC_TYPE_CODE_DATA;
987 desc->dpl = 0;
988 desc->p = 1;
989 desc->limit = 0xf;
990 desc->avl = 0;
991 desc->l = 0;
992 desc->d = SEG_OP_SIZE_32BIT;
993 desc->g = SEG_GRANULARITY_4KB;
994 desc->base2 = 0x00;
995
996#ifdef CONFIG_X86_64
997 /* Task segment value */
998 desc++;
999 desc->limit0 = 0x0000;
1000 desc->base0 = 0x0000;
1001 desc->base1 = 0x0000;
1002 desc->type = SEG_TYPE_TSS;
1003 desc->s = 0;
1004 desc->dpl = 0;
1005 desc->p = 1;
1006 desc->limit = 0x0;
1007 desc->avl = 0;
1008 desc->l = 0;
1009 desc->d = 0;
1010 desc->g = SEG_GRANULARITY_4KB;
1011 desc->base2 = 0x00;
1012#endif /* CONFIG_X86_64 */
1013
1014 asm volatile ("lidt %0" : : "m" (*idt));
1015 asm volatile ("lgdt %0" : : "m" (*gdt));
1016
1017 asm volatile("cli");
1018
1019 return boot_params;
1020fail:
1021 return NULL;
1022}
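
The E820 conversion at the end of make_boot_params() above folds adjacent EFI descriptors of the same type into a single e820 entry. A self-contained sketch of that merge rule, with struct range and merge_ranges() as illustrative names (not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    struct range { uint64_t addr, size; uint32_t type; };

    size_t merge_ranges(struct range *out, const struct range *in, size_t n)
    {
            struct range *prev = NULL;
            size_t nr = 0;

            for (size_t i = 0; i < n; i++) {
                    /* same type and physically contiguous: extend prev,
                     * exactly as the loop in eboot.c does */
                    if (prev && prev->type == in[i].type &&
                        prev->addr + prev->size == in[i].addr) {
                            prev->size += in[i].size;
                    } else {
                            out[nr] = in[i];
                            prev = &out[nr++];
                    }
            }
            return nr;
    }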
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000000..39251663e65b
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,61 @@
1#ifndef BOOT_COMPRESSED_EBOOT_H
2#define BOOT_COMPRESSED_EBOOT_H
3
4#define SEG_TYPE_DATA (0 << 3)
5#define SEG_TYPE_READ_WRITE (1 << 1)
6#define SEG_TYPE_CODE (1 << 3)
7#define SEG_TYPE_EXEC_READ (1 << 1)
8#define SEG_TYPE_TSS ((1 << 3) | (1 << 0))
9#define SEG_OP_SIZE_32BIT (1 << 0)
10#define SEG_GRANULARITY_4KB (1 << 0)
11
12#define DESC_TYPE_CODE_DATA (1 << 0)
13
14#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT)
15#define EFI_READ_CHUNK_SIZE (1024 * 1024)
16
17#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
18#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
19#define PIXEL_BIT_MASK 2
20#define PIXEL_BLT_ONLY 3
21#define PIXEL_FORMAT_MAX 4
22
23struct efi_pixel_bitmask {
24 u32 red_mask;
25 u32 green_mask;
26 u32 blue_mask;
27 u32 reserved_mask;
28};
29
30struct efi_graphics_output_mode_info {
31 u32 version;
32 u32 horizontal_resolution;
33 u32 vertical_resolution;
34 int pixel_format;
35 struct efi_pixel_bitmask pixel_information;
36 u32 pixels_per_scan_line;
37} __packed;
38
39struct efi_graphics_output_protocol_mode {
40 u32 max_mode;
41 u32 mode;
42 unsigned long info;
43 unsigned long size_of_info;
44 u64 frame_buffer_base;
45 unsigned long frame_buffer_size;
46} __packed;
47
48struct efi_graphics_output_protocol {
49 void *query_mode;
50 unsigned long set_mode;
51 unsigned long blt;
52 struct efi_graphics_output_protocol_mode *mode;
53};
54
55struct efi_uga_draw_protocol {
56 void *get_mode;
57 void *set_mode;
58 void *blt;
59};
60
61#endif /* BOOT_COMPRESSED_EBOOT_H */
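
The SEG_TYPE_* and DESC_TYPE_* values above are the raw bits of an x86 segment descriptor's type and S fields, which efi_main() pokes into struct desc_struct when building the boot GDT. A hosted-C sketch (illustrative, not part of the patch) showing how they combine into the familiar 0x9a/0x92 access bytes:

    #include <stdint.h>
    #include <stdio.h>

    #define SEG_TYPE_DATA        (0 << 3)
    #define SEG_TYPE_READ_WRITE  (1 << 1)
    #define SEG_TYPE_CODE        (1 << 3)
    #define SEG_TYPE_EXEC_READ   (1 << 1)

    /* access byte layout: P(7) DPL(6:5) S(4) TYPE(3:0) */
    static uint8_t access_byte(uint8_t type, uint8_t s, uint8_t dpl, uint8_t p)
    {
            return (uint8_t)(type | (s << 4) | (dpl << 5) | (p << 7));
    }

    int main(void)
    {
            /* matches the code/data descriptors built in efi_main() */
            printf("code: 0x%02x\n",
                   access_byte(SEG_TYPE_CODE | SEG_TYPE_EXEC_READ, 1, 0, 1));
            printf("data: 0x%02x\n",
                   access_byte(SEG_TYPE_DATA | SEG_TYPE_READ_WRITE, 1, 0, 1));
            return 0;
    }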
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000000..a53440e81d52
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off. Note that this implementation is different from the one in
6 * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
7 * mode at this point.
8 */
9
10#include <linux/linkage.h>
11#include <asm/page_types.h>
12
13/*
14 * efi_call_phys(void *, ...) is a function with variable parameters.
 15 * All callers of this function ensure that all parameters are 4 bytes wide.
16 */
17
18/*
 19 * In the gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee-saved.
 20 * So we save all of them at the beginning of this function and restore them
 21 * at the end no matter how many we use, because we cannot assume that EFI
 22 * runtime service functions comply with the gcc calling convention.
23 */
24
25.text
26ENTRY(efi_call_phys)
27 /*
 28 * 0. This function can only be called from the Linux kernel, so CS has
 29 * been set to 0x0010 and DS and SS to 0x0018. In EFI the values of these
 30 * registers are the same and the corresponding GDT entries are identical,
 31 * so nothing needs to change in the segment registers or the GDT itself,
 32 * only the GDT base register in the prologue and epilogue.
33 */
34
35 /*
36 * 1. Because we haven't been relocated by this point we need to
37 * use relative addressing.
38 */
39 call 1f
401: popl %edx
41 subl $1b, %edx
42
43 /*
44 * 2. Now on the top of stack is the return
45 * address in the caller of efi_call_phys(), then parameter 1,
46 * parameter 2, ..., param n. To make things easy, we save the return
47 * address of efi_call_phys in a global variable.
48 */
49 popl %ecx
50 movl %ecx, saved_return_addr(%edx)
 51 /* get the function pointer into ECX */
52 popl %ecx
53 movl %ecx, efi_rt_function_ptr(%edx)
54
55 /*
56 * 3. Call the physical function.
57 */
58 call *%ecx
59
60 /*
 61 * 4. Balance the stack. Because EAX contains the return value,
62 * we'd better not clobber it. We need to calculate our address
63 * again because %ecx and %edx are not preserved across EFI function
64 * calls.
65 */
66 call 1f
671: popl %edx
68 subl $1b, %edx
69
70 movl efi_rt_function_ptr(%edx), %ecx
71 pushl %ecx
72
73 /*
 74 * 5. Push the saved return address onto the stack and return.
75 */
76 movl saved_return_addr(%edx), %ecx
77 pushl %ecx
78 ret
79ENDPROC(efi_call_phys)
80.previous
81
82.data
83saved_return_addr:
84 .long 0
85efi_rt_function_ptr:
86 .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000000..cedc60de86eb
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a39ce4..a0559930a180 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
32 32
33 __HEAD 33 __HEAD
34ENTRY(startup_32) 34ENTRY(startup_32)
35#ifdef CONFIG_EFI_STUB
36 /*
37 * We don't need the return address, so set up the stack so
 38 * efi_main() can find its arguments.
39 */
40 add $0x4, %esp
41
42 call efi_main
43 cmpl $0, %eax
44 je preferred_addr
45 movl %eax, %esi
46 call 1f
471:
48 popl %eax
49 subl $1b, %eax
50 subl BP_pref_address(%esi), %eax
51 add BP_code32_start(%esi), %eax
52 leal preferred_addr(%eax), %eax
53 jmp *%eax
54
55preferred_addr:
56#endif
35 cld 57 cld
36 /* 58 /*
37 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 59 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d13dc1..558d76ce23bc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
 199 * an identity mapped page table being provided that maps our 199 * an identity mapped page table being provided that maps our
200 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
201 */ 201 */
202#ifdef CONFIG_EFI_STUB
203 pushq %rsi
204 mov %rcx, %rdi
205 mov %rdx, %rsi
206 call efi_main
207 popq %rsi
208 cmpq $0,%rax
209 je preferred_addr
210 movq %rax,%rsi
211 call 1f
2121:
213 popq %rax
214 subq $1b, %rax
215 subq BP_pref_address(%rsi), %rax
 216 add BP_code32_start(%rsi), %eax
217 leaq preferred_addr(%rax), %rax
218 jmp *%rax
219
220preferred_addr:
221#endif
202 222
203 /* Setup data segments. */ 223 /* Setup data segments. */
204 xorl %eax, %eax 224 xorl %eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e693cd72..ffb9c5c9d748 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
1#include "misc.h" 1#include "misc.h"
2
3int memcmp(const void *s1, const void *s2, size_t len)
4{
5 u8 diff;
6 asm("repe; cmpsb; setnz %0"
7 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
8 return diff;
9}
10
2#include "../string.c" 11#include "../string.c"
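
Note that the repe; cmpsb sequence above returns 0 for equal buffers and 1 otherwise (from setnz), not the signed ordering a full memcmp() provides; the callers in this patch (e.g. strstr() in arch/x86/boot/string.c) only test for equality, so that contract is sufficient. A portable C model of the same behaviour (illustrative, not part of the patch):

    #include <stddef.h>

    /* Returns nonzero iff the buffers differ, like the asm version. */
    int memcmp_eq_only(const void *s1, const void *s2, size_t len)
    {
            const unsigned char *a = s1, *b = s2;

            while (len--)
                    if (*a++ != *b++)
                            return 1;
            return 0;
    }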
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d458ec8c..f1bbeeb09148 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
45 45
46 .global bootsect_start 46 .global bootsect_start
47bootsect_start: 47bootsect_start:
48#ifdef CONFIG_EFI_STUB
49 # "MZ", MS-DOS header
50 .byte 0x4d
51 .byte 0x5a
52#endif
48 53
49 # Normalize the start address 54 # Normalize the start address
50 ljmp $BOOTSEG, $start2 55 ljmp $BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
79 # invoke the BIOS reset code... 84 # invoke the BIOS reset code...
80 ljmp $0xf000,$0xfff0 85 ljmp $0xf000,$0xfff0
81 86
87#ifdef CONFIG_EFI_STUB
88 .org 0x3c
89 #
90 # Offset to the PE header.
91 #
92 .long pe_header
93#endif /* CONFIG_EFI_STUB */
94
82 .section ".bsdata", "a" 95 .section ".bsdata", "a"
83bugger_off_msg: 96bugger_off_msg:
84 .ascii "Direct booting from floppy is no longer supported.\r\n" 97 .ascii "Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
87 .ascii "Remove disk and press any key to reboot . . .\r\n" 100 .ascii "Remove disk and press any key to reboot . . .\r\n"
88 .byte 0 101 .byte 0
89 102
103#ifdef CONFIG_EFI_STUB
104pe_header:
105 .ascii "PE"
106 .word 0
107
108coff_header:
109#ifdef CONFIG_X86_32
110 .word 0x14c # i386
111#else
112 .word 0x8664 # x86-64
113#endif
114 .word 2 # nr_sections
115 .long 0 # TimeDateStamp
116 .long 0 # PointerToSymbolTable
117 .long 1 # NumberOfSymbols
118 .word section_table - optional_header # SizeOfOptionalHeader
119#ifdef CONFIG_X86_32
120 .word 0x306 # Characteristics.
121 # IMAGE_FILE_32BIT_MACHINE |
122 # IMAGE_FILE_DEBUG_STRIPPED |
123 # IMAGE_FILE_EXECUTABLE_IMAGE |
124 # IMAGE_FILE_LINE_NUMS_STRIPPED
125#else
126 .word 0x206 # Characteristics
127 # IMAGE_FILE_DEBUG_STRIPPED |
128 # IMAGE_FILE_EXECUTABLE_IMAGE |
129 # IMAGE_FILE_LINE_NUMS_STRIPPED
130#endif
131
132optional_header:
133#ifdef CONFIG_X86_32
134 .word 0x10b # PE32 format
135#else
136 .word 0x20b # PE32+ format
137#endif
138 .byte 0x02 # MajorLinkerVersion
139 .byte 0x14 # MinorLinkerVersion
140
141 # Filled in by build.c
142 .long 0 # SizeOfCode
143
144 .long 0 # SizeOfInitializedData
145 .long 0 # SizeOfUninitializedData
146
147 # Filled in by build.c
148 .long 0x0000 # AddressOfEntryPoint
149
150 .long 0x0000 # BaseOfCode
151#ifdef CONFIG_X86_32
152 .long 0 # data
153#endif
154
155extra_header_fields:
156#ifdef CONFIG_X86_32
157 .long 0 # ImageBase
158#else
159 .quad 0 # ImageBase
160#endif
161 .long 0x1000 # SectionAlignment
162 .long 0x200 # FileAlignment
163 .word 0 # MajorOperatingSystemVersion
164 .word 0 # MinorOperatingSystemVersion
165 .word 0 # MajorImageVersion
166 .word 0 # MinorImageVersion
167 .word 0 # MajorSubsystemVersion
168 .word 0 # MinorSubsystemVersion
169 .long 0 # Win32VersionValue
170
171 #
172 # The size of the bzImage is written in tools/build.c
173 #
174 .long 0 # SizeOfImage
175
176 .long 0x200 # SizeOfHeaders
177 .long 0 # CheckSum
178 .word 0xa # Subsystem (EFI application)
179 .word 0 # DllCharacteristics
180#ifdef CONFIG_X86_32
181 .long 0 # SizeOfStackReserve
182 .long 0 # SizeOfStackCommit
183 .long 0 # SizeOfHeapReserve
184 .long 0 # SizeOfHeapCommit
185#else
186 .quad 0 # SizeOfStackReserve
187 .quad 0 # SizeOfStackCommit
188 .quad 0 # SizeOfHeapReserve
189 .quad 0 # SizeOfHeapCommit
190#endif
191 .long 0 # LoaderFlags
192 .long 0x1 # NumberOfRvaAndSizes
193
194 .quad 0 # ExportTable
195 .quad 0 # ImportTable
196 .quad 0 # ResourceTable
197 .quad 0 # ExceptionTable
 198 .quad 0 # CertificateTable
199 .quad 0 # BaseRelocationTable
200
201 # Section table
202section_table:
203 .ascii ".text"
204 .byte 0
205 .byte 0
206 .byte 0
207 .long 0
208 .long 0x0 # startup_{32,64}
209 .long 0 # Size of initialized data
210 # on disk
211 .long 0x0 # startup_{32,64}
212 .long 0 # PointerToRelocations
213 .long 0 # PointerToLineNumbers
214 .word 0 # NumberOfRelocations
215 .word 0 # NumberOfLineNumbers
216 .long 0x60500020 # Characteristics (section flags)
217
218 #
219 # The EFI application loader requires a relocation section
220 # because EFI applications are relocatable and not having
221 # this section seems to confuse it. But since we don't need
 222 # the loader to fix up any relocs for us, just fill it with a
223 # single dummy reloc.
224 #
225 .ascii ".reloc"
226 .byte 0
227 .byte 0
228 .long reloc_end - reloc_start
229 .long reloc_start
230 .long reloc_end - reloc_start # SizeOfRawData
231 .long reloc_start # PointerToRawData
232 .long 0 # PointerToRelocations
233 .long 0 # PointerToLineNumbers
234 .word 0 # NumberOfRelocations
235 .word 0 # NumberOfLineNumbers
236 .long 0x42100040 # Characteristics (section flags)
237#endif /* CONFIG_EFI_STUB */
90 238
91 # Kernel attributes; used by setup. This is part 1 of the 239 # Kernel attributes; used by setup. This is part 1 of the
92 # header, from the old boot sector. 240 # header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
318setup_corrupt: 466setup_corrupt:
319 .byte 7 467 .byte 7
320 .string "No setup signature found...\n" 468 .string "No setup signature found...\n"
469
470 .data
471dummy: .long 0
472
473 .section .reloc
474reloc_start:
475 .long dummy - reloc_start
476 .long 10
477 .word 0
478reloc_end:
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc4058dd26..574dedfe2890 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
111 111
112 return result; 112 return result;
113} 113}
114
115/**
116 * strlen - Find the length of a string
117 * @s: The string to be sized
118 */
119size_t strlen(const char *s)
120{
121 const char *sc;
122
123 for (sc = s; *sc != '\0'; ++sc)
124 /* nothing */;
125 return sc - s;
126}
127
128/**
129 * strstr - Find the first substring in a %NUL terminated string
130 * @s1: The string to be searched
131 * @s2: The string to search for
132 */
133char *strstr(const char *s1, const char *s2)
134{
135 size_t l1, l2;
136
137 l2 = strlen(s2);
138 if (!l2)
139 return (char *)s1;
140 l1 = strlen(s1);
141 while (l1 >= l2) {
142 l1--;
143 if (!memcmp(s1, s2, l2))
144 return (char *)s1;
145 s1++;
146 }
147 return NULL;
148}
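
The strstr() above is the simple O(len(s1) * len(s2)) scan, which is plenty for a kernel command line; handle_ramdisks() in eboot.c uses it to walk repeated "initrd=" options. A hosted-C usage sketch (not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *cmdline = "root=/dev/sda1 initrd=a.img initrd=b.img";
            const char *p = cmdline;

            while ((p = strstr(p, "initrd=")) != NULL) {
                    printf("initrd option at offset %td\n", p - cmdline);
                    p += 7;   /* skip past "initrd=", as eboot.c does */
            }
            return 0;
    }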
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0b3c20..4e9bd6bcafa6 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
137{ 137{
138#ifdef CONFIG_EFI_STUB
139 unsigned int file_sz, pe_header;
140#endif
138 unsigned int i, sz, setup_sectors; 141 unsigned int i, sz, setup_sectors;
139 int c; 142 int c;
140 u32 sys_size; 143 u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
194 buf[0x1f6] = sys_size >> 16; 197 buf[0x1f6] = sys_size >> 16;
195 buf[0x1f7] = sys_size >> 24; 198 buf[0x1f7] = sys_size >> 24;
196 199
200#ifdef CONFIG_EFI_STUB
201 file_sz = sz + i + ((sys_size * 16) - sz);
202
203 pe_header = *(unsigned int *)&buf[0x3c];
204
205 /* Size of code */
206 *(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
207
208 /* Size of image */
209 *(unsigned int *)&buf[pe_header + 0x50] = file_sz;
210
211#ifdef CONFIG_X86_32
212 /* Address of entry point */
213 *(unsigned int *)&buf[pe_header + 0x28] = i;
214
215 /* .text size */
216 *(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
217
218 /* .text size of initialised data */
219 *(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
220#else
221 /*
222 * Address of entry point. startup_32 is at the beginning and
223 * the 64-bit entry point (startup_64) is always 512 bytes
224 * after.
225 */
226 *(unsigned int *)&buf[pe_header + 0x28] = i + 512;
227
228 /* .text size */
229 *(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
230
231 /* .text size of initialised data */
232 *(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
233#endif /* CONFIG_X86_32 */
234#endif /* CONFIG_EFI_STUB */
235
236 crc = partial_crc32(buf, i, crc);
237 if (fwrite(buf, 1, i, stdout) != i)
238 die("Writing setup failed");
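
The magic offsets in the hunk above follow directly from the PE/COFF layout: the 32-bit value at 0x3c (e_lfanew) gives the file offset of the "PE\0\0" signature, the 20-byte COFF file header follows it, so the optional header starts 24 bytes in; SizeOfCode sits 4 bytes into the optional header, which is why the code writes at pe_header + 0x1c. A hedged sketch of that arithmetic (the helper name is illustrative, not from the patch):

#include <stdint.h>

/* Sketch only: locate the PE optional header inside the setup buffer. */
static uint8_t *pe_optional_header(uint8_t *buf)
{
	uint32_t pe_header = *(uint32_t *)&buf[0x3c];	/* e_lfanew */

	/* "PE\0\0" signature (4 bytes) + COFF file header (20 bytes) */
	return &buf[pe_header + 24];
}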
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..2b0b9631474b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,12 +5,14 @@
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
8obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
9
10obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
11obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
13obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
14obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
15obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
16obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
17obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
18
@@ -20,12 +22,14 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
22aes-i586-y := aes-i586-asm_32.o aes_glue.o
23twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
24salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
25serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
26
27aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
28blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
29twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
30twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
31salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
32serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
33
34aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
35
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..4e37677ca851
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
1/*
2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-i586-asm_32.S"
28.text
29
30#define arg_ctx 4
31#define arg_dst 8
32#define arg_src 12
33#define arg_xor 16
34
35/**********************************************************************
36 4-way SSE2 serpent
37 **********************************************************************/
38#define CTX %edx
39
40#define RA %xmm0
41#define RB %xmm1
42#define RC %xmm2
43#define RD %xmm3
44#define RE %xmm4
45
46#define RT0 %xmm5
47#define RT1 %xmm6
48
49#define RNOT %xmm7
50
51#define get_key(i, j, t) \
52 movd (4*(i)+(j))*4(CTX), t; \
53 pshufd $0, t, t;
54
55#define K(x0, x1, x2, x3, x4, i) \
56 get_key(i, 0, x4); \
57 get_key(i, 1, RT0); \
58 get_key(i, 2, RT1); \
59 pxor x4, x0; \
60 pxor RT0, x1; \
61 pxor RT1, x2; \
62 get_key(i, 3, x4); \
63 pxor x4, x3;
64
65#define LK(x0, x1, x2, x3, x4, i) \
66 movdqa x0, x4; \
67 pslld $13, x0; \
68 psrld $(32 - 13), x4; \
69 por x4, x0; \
70 pxor x0, x1; \
71 movdqa x2, x4; \
72 pslld $3, x2; \
73 psrld $(32 - 3), x4; \
74 por x4, x2; \
75 pxor x2, x1; \
76 movdqa x1, x4; \
77 pslld $1, x1; \
78 psrld $(32 - 1), x4; \
79 por x4, x1; \
80 movdqa x0, x4; \
81 pslld $3, x4; \
82 pxor x2, x3; \
83 pxor x4, x3; \
84 movdqa x3, x4; \
85 pslld $7, x3; \
86 psrld $(32 - 7), x4; \
87 por x4, x3; \
88 movdqa x1, x4; \
89 pslld $7, x4; \
90 pxor x1, x0; \
91 pxor x3, x0; \
92 pxor x3, x2; \
93 pxor x4, x2; \
94 movdqa x0, x4; \
95 get_key(i, 1, RT0); \
96 pxor RT0, x1; \
97 get_key(i, 3, RT0); \
98 pxor RT0, x3; \
99 pslld $5, x0; \
100 psrld $(32 - 5), x4; \
101 por x4, x0; \
102 movdqa x2, x4; \
103 pslld $22, x2; \
104 psrld $(32 - 22), x4; \
105 por x4, x2; \
106 get_key(i, 0, RT0); \
107 pxor RT0, x0; \
108 get_key(i, 2, RT0); \
109 pxor RT0, x2;
110
111#define KL(x0, x1, x2, x3, x4, i) \
112 K(x0, x1, x2, x3, x4, i); \
113 movdqa x0, x4; \
114 psrld $5, x0; \
115 pslld $(32 - 5), x4; \
116 por x4, x0; \
117 movdqa x2, x4; \
118 psrld $22, x2; \
119 pslld $(32 - 22), x4; \
120 por x4, x2; \
121 pxor x3, x2; \
122 pxor x3, x0; \
123 movdqa x1, x4; \
124 pslld $7, x4; \
125 pxor x1, x0; \
126 pxor x4, x2; \
127 movdqa x1, x4; \
128 psrld $1, x1; \
129 pslld $(32 - 1), x4; \
130 por x4, x1; \
131 movdqa x3, x4; \
132 psrld $7, x3; \
133 pslld $(32 - 7), x4; \
134 por x4, x3; \
135 pxor x0, x1; \
136 movdqa x0, x4; \
137 pslld $3, x4; \
138 pxor x4, x3; \
139 movdqa x0, x4; \
140 psrld $13, x0; \
141 pslld $(32 - 13), x4; \
142 por x4, x0; \
143 pxor x2, x1; \
144 pxor x2, x3; \
145 movdqa x2, x4; \
146 psrld $3, x2; \
147 pslld $(32 - 3), x4; \
148 por x4, x2;
149
150#define S0(x0, x1, x2, x3, x4) \
151 movdqa x3, x4; \
152 por x0, x3; \
153 pxor x4, x0; \
154 pxor x2, x4; \
155 pxor RNOT, x4; \
156 pxor x1, x3; \
157 pand x0, x1; \
158 pxor x4, x1; \
159 pxor x0, x2; \
160 pxor x3, x0; \
161 por x0, x4; \
162 pxor x2, x0; \
163 pand x1, x2; \
164 pxor x2, x3; \
165 pxor RNOT, x1; \
166 pxor x4, x2; \
167 pxor x2, x1;
168
169#define S1(x0, x1, x2, x3, x4) \
170 movdqa x1, x4; \
171 pxor x0, x1; \
172 pxor x3, x0; \
173 pxor RNOT, x3; \
174 pand x1, x4; \
175 por x1, x0; \
176 pxor x2, x3; \
177 pxor x3, x0; \
178 pxor x3, x1; \
179 pxor x4, x3; \
180 por x4, x1; \
181 pxor x2, x4; \
182 pand x0, x2; \
183 pxor x1, x2; \
184 por x0, x1; \
185 pxor RNOT, x0; \
186 pxor x2, x0; \
187 pxor x1, x4;
188
189#define S2(x0, x1, x2, x3, x4) \
190 pxor RNOT, x3; \
191 pxor x0, x1; \
192 movdqa x0, x4; \
193 pand x2, x0; \
194 pxor x3, x0; \
195 por x4, x3; \
196 pxor x1, x2; \
197 pxor x1, x3; \
198 pand x0, x1; \
199 pxor x2, x0; \
200 pand x3, x2; \
201 por x1, x3; \
202 pxor RNOT, x0; \
203 pxor x0, x3; \
204 pxor x0, x4; \
205 pxor x2, x0; \
206 por x2, x1;
207
208#define S3(x0, x1, x2, x3, x4) \
209 movdqa x1, x4; \
210 pxor x3, x1; \
211 por x0, x3; \
212 pand x0, x4; \
213 pxor x2, x0; \
214 pxor x1, x2; \
215 pand x3, x1; \
216 pxor x3, x2; \
217 por x4, x0; \
218 pxor x3, x4; \
219 pxor x0, x1; \
220 pand x3, x0; \
221 pand x4, x3; \
222 pxor x2, x3; \
223 por x1, x4; \
224 pand x1, x2; \
225 pxor x3, x4; \
226 pxor x3, x0; \
227 pxor x2, x3;
228
229#define S4(x0, x1, x2, x3, x4) \
230 movdqa x3, x4; \
231 pand x0, x3; \
232 pxor x4, x0; \
233 pxor x2, x3; \
234 por x4, x2; \
235 pxor x1, x0; \
236 pxor x3, x4; \
237 por x0, x2; \
238 pxor x1, x2; \
239 pand x0, x1; \
240 pxor x4, x1; \
241 pand x2, x4; \
242 pxor x3, x2; \
243 pxor x0, x4; \
244 por x1, x3; \
245 pxor RNOT, x1; \
246 pxor x0, x3;
247
248#define S5(x0, x1, x2, x3, x4) \
249 movdqa x1, x4; \
250 por x0, x1; \
251 pxor x1, x2; \
252 pxor RNOT, x3; \
253 pxor x0, x4; \
254 pxor x2, x0; \
255 pand x4, x1; \
256 por x3, x4; \
257 pxor x0, x4; \
258 pand x3, x0; \
259 pxor x3, x1; \
260 pxor x2, x3; \
261 pxor x1, x0; \
262 pand x4, x2; \
263 pxor x2, x1; \
264 pand x0, x2; \
265 pxor x2, x3;
266
267#define S6(x0, x1, x2, x3, x4) \
268 movdqa x1, x4; \
269 pxor x0, x3; \
270 pxor x2, x1; \
271 pxor x0, x2; \
272 pand x3, x0; \
273 por x3, x1; \
274 pxor RNOT, x4; \
275 pxor x1, x0; \
276 pxor x2, x1; \
277 pxor x4, x3; \
278 pxor x0, x4; \
279 pand x0, x2; \
280 pxor x1, x4; \
281 pxor x3, x2; \
282 pand x1, x3; \
283 pxor x0, x3; \
284 pxor x2, x1;
285
286#define S7(x0, x1, x2, x3, x4) \
287 pxor RNOT, x1; \
288 movdqa x1, x4; \
289 pxor RNOT, x0; \
290 pand x2, x1; \
291 pxor x3, x1; \
292 por x4, x3; \
293 pxor x2, x4; \
294 pxor x3, x2; \
295 pxor x0, x3; \
296 por x1, x0; \
297 pand x0, x2; \
298 pxor x4, x0; \
299 pxor x3, x4; \
300 pand x0, x3; \
301 pxor x1, x4; \
302 pxor x4, x2; \
303 pxor x1, x3; \
304 por x0, x4; \
305 pxor x1, x4;
306
307#define SI0(x0, x1, x2, x3, x4) \
308 movdqa x3, x4; \
309 pxor x0, x1; \
310 por x1, x3; \
311 pxor x1, x4; \
312 pxor RNOT, x0; \
313 pxor x3, x2; \
314 pxor x0, x3; \
315 pand x1, x0; \
316 pxor x2, x0; \
317 pand x3, x2; \
318 pxor x4, x3; \
319 pxor x3, x2; \
320 pxor x3, x1; \
321 pand x0, x3; \
322 pxor x0, x1; \
323 pxor x2, x0; \
324 pxor x3, x4;
325
326#define SI1(x0, x1, x2, x3, x4) \
327 pxor x3, x1; \
328 movdqa x0, x4; \
329 pxor x2, x0; \
330 pxor RNOT, x2; \
331 por x1, x4; \
332 pxor x3, x4; \
333 pand x1, x3; \
334 pxor x2, x1; \
335 pand x4, x2; \
336 pxor x1, x4; \
337 por x3, x1; \
338 pxor x0, x3; \
339 pxor x0, x2; \
340 por x4, x0; \
341 pxor x4, x2; \
342 pxor x0, x1; \
343 pxor x1, x4;
344
345#define SI2(x0, x1, x2, x3, x4) \
346 pxor x1, x2; \
347 movdqa x3, x4; \
348 pxor RNOT, x3; \
349 por x2, x3; \
350 pxor x4, x2; \
351 pxor x0, x4; \
352 pxor x1, x3; \
353 por x2, x1; \
354 pxor x0, x2; \
355 pxor x4, x1; \
356 por x3, x4; \
357 pxor x3, x2; \
358 pxor x2, x4; \
359 pand x1, x2; \
360 pxor x3, x2; \
361 pxor x4, x3; \
362 pxor x0, x4;
363
364#define SI3(x0, x1, x2, x3, x4) \
365 pxor x1, x2; \
366 movdqa x1, x4; \
367 pand x2, x1; \
368 pxor x0, x1; \
369 por x4, x0; \
370 pxor x3, x4; \
371 pxor x3, x0; \
372 por x1, x3; \
373 pxor x2, x1; \
374 pxor x3, x1; \
375 pxor x2, x0; \
376 pxor x3, x2; \
377 pand x1, x3; \
378 pxor x0, x1; \
379 pand x2, x0; \
380 pxor x3, x4; \
381 pxor x0, x3; \
382 pxor x1, x0;
383
384#define SI4(x0, x1, x2, x3, x4) \
385 pxor x3, x2; \
386 movdqa x0, x4; \
387 pand x1, x0; \
388 pxor x2, x0; \
389 por x3, x2; \
390 pxor RNOT, x4; \
391 pxor x0, x1; \
392 pxor x2, x0; \
393 pand x4, x2; \
394 pxor x0, x2; \
395 por x4, x0; \
396 pxor x3, x0; \
397 pand x2, x3; \
398 pxor x3, x4; \
399 pxor x1, x3; \
400 pand x0, x1; \
401 pxor x1, x4; \
402 pxor x3, x0;
403
404#define SI5(x0, x1, x2, x3, x4) \
405 movdqa x1, x4; \
406 por x2, x1; \
407 pxor x4, x2; \
408 pxor x3, x1; \
409 pand x4, x3; \
410 pxor x3, x2; \
411 por x0, x3; \
412 pxor RNOT, x0; \
413 pxor x2, x3; \
414 por x0, x2; \
415 pxor x1, x4; \
416 pxor x4, x2; \
417 pand x0, x4; \
418 pxor x1, x0; \
419 pxor x3, x1; \
420 pand x2, x0; \
421 pxor x3, x2; \
422 pxor x2, x0; \
423 pxor x4, x2; \
424 pxor x3, x4;
425
426#define SI6(x0, x1, x2, x3, x4) \
427 pxor x2, x0; \
428 movdqa x0, x4; \
429 pand x3, x0; \
430 pxor x3, x2; \
431 pxor x2, x0; \
432 pxor x1, x3; \
433 por x4, x2; \
434 pxor x3, x2; \
435 pand x0, x3; \
436 pxor RNOT, x0; \
437 pxor x1, x3; \
438 pand x2, x1; \
439 pxor x0, x4; \
440 pxor x4, x3; \
441 pxor x2, x4; \
442 pxor x1, x0; \
443 pxor x0, x2;
444
445#define SI7(x0, x1, x2, x3, x4) \
446 movdqa x3, x4; \
447 pand x0, x3; \
448 pxor x2, x0; \
449 por x4, x2; \
450 pxor x1, x4; \
451 pxor RNOT, x0; \
452 por x3, x1; \
453 pxor x0, x4; \
454 pand x2, x0; \
455 pxor x1, x0; \
456 pand x2, x1; \
457 pxor x2, x3; \
458 pxor x3, x4; \
459 pand x3, x2; \
460 por x0, x3; \
461 pxor x4, x1; \
462 pxor x4, x3; \
463 pand x0, x4; \
464 pxor x2, x4;
465
466#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
467 movdqa x2, t3; \
468 movdqa x0, t1; \
469 unpcklps x3, t3; \
470 movdqa x0, t2; \
471 unpcklps x1, t1; \
472 unpckhps x1, t2; \
473 movdqa t3, x1; \
474 unpckhps x3, x2; \
475 movdqa t1, x0; \
476 movhlps t1, x1; \
477 movdqa t2, t1; \
478 movlhps t3, x0; \
479 movlhps x2, t1; \
480 movhlps t2, x2; \
481 movdqa x2, x3; \
482 movdqa t1, x2;
483
484#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
485 movdqu (0*4*4)(in), x0; \
486 movdqu (1*4*4)(in), x1; \
487 movdqu (2*4*4)(in), x2; \
488 movdqu (3*4*4)(in), x3; \
489 \
490 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
491
492#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
493 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
494 \
495 movdqu x0, (0*4*4)(out); \
496 movdqu x1, (1*4*4)(out); \
497 movdqu x2, (2*4*4)(out); \
498 movdqu x3, (3*4*4)(out);
499
500#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
501 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
502 \
503 movdqu (0*4*4)(out), t0; \
504 pxor t0, x0; \
505 movdqu x0, (0*4*4)(out); \
506 movdqu (1*4*4)(out), t0; \
507 pxor t0, x1; \
508 movdqu x1, (1*4*4)(out); \
509 movdqu (2*4*4)(out), t0; \
510 pxor t0, x2; \
511 movdqu x2, (2*4*4)(out); \
512 movdqu (3*4*4)(out), t0; \
513 pxor t0, x3; \
514 movdqu x3, (3*4*4)(out);
515
516.align 8
517.global __serpent_enc_blk_4way
518.type __serpent_enc_blk_4way,@function;
519
520__serpent_enc_blk_4way:
521 /* input:
522 * arg_ctx(%esp): ctx, CTX
523 * arg_dst(%esp): dst
524 * arg_src(%esp): src
525 * arg_xor(%esp): bool, if true: xor output
526 */
527
528 pcmpeqd RNOT, RNOT;
529
530 movl arg_ctx(%esp), CTX;
531
532 movl arg_src(%esp), %eax;
533 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
534
535 K(RA, RB, RC, RD, RE, 0);
536 S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
537 S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
538 S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
539 S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
540 S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
541 S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
542 S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
543 S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
544 S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
545 S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
546 S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
547 S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
548 S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
549 S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
550 S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
551 S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
552 S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
553 S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
554 S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
555 S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
556 S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
557 S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
558 S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
559 S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
560 S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
561 S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
562 S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
563 S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
564 S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
565 S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
566 S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
567 S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
568
569 movl arg_dst(%esp), %eax;
570
571 cmpb $0, arg_xor(%esp);
572 jnz __enc_xor4;
573
574 write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
575
576 ret;
577
578__enc_xor4:
579 xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
580
581 ret;
582
583.align 8
584.global serpent_dec_blk_4way
585.type serpent_dec_blk_4way,@function;
586
587serpent_dec_blk_4way:
588 /* input:
589 * arg_ctx(%esp): ctx, CTX
590 * arg_dst(%esp): dst
591 * arg_src(%esp): src
592 */
593
594 pcmpeqd RNOT, RNOT;
595
596 movl arg_ctx(%esp), CTX;
597
598 movl arg_src(%esp), %eax;
599 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
600
601 K(RA, RB, RC, RD, RE, 32);
602 SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
603 SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
604 SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
605 SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
606 SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
607 SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
608 SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
609 SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
610 SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
611 SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
612 SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
613 SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
614 SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
615 SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
616 SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
617 SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
618 SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
619 SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
620 SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
621 SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
622 SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
623 SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
624 SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
625 SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
626 SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
627 SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
628 SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
629 SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
630 SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
631 SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
632 SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
633 SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
634
635 movl arg_dst(%esp), %eax;
636 write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
637
638 ret;
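
The LK/KL bodies above are Serpent's linear transform unrolled for SSE2, which has no xmm rotate instruction: every rotate becomes the pslld/psrld/por triple. A scalar C rendition of what LK() computes per 32-bit lane before the key mixing (illustrative, not part of the patch; the transform itself is from the Serpent specification):

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));	/* pslld + psrld + por */
}

/* Serpent forward linear transform, one 32-bit lane of LK() */
static void serpent_lt(uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3)
{
	*x0 = rol32(*x0, 13);
	*x2 = rol32(*x2, 3);
	*x1 = rol32(*x1 ^ *x0 ^ *x2, 1);
	*x3 = rol32(*x3 ^ *x2 ^ (*x0 << 3), 7);
	*x0 = rol32(*x0 ^ *x1 ^ *x3, 5);
	*x2 = rol32(*x2 ^ *x3 ^ (*x1 << 7), 22);
}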
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way SSE2 serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define RA2 %xmm5
42#define RB2 %xmm6
43#define RC2 %xmm7
44#define RD2 %xmm8
45#define RE2 %xmm9
46
47#define RNOT %xmm10
48
49#define RK0 %xmm11
50#define RK1 %xmm12
51#define RK2 %xmm13
52#define RK3 %xmm14
53
54#define S0_1(x0, x1, x2, x3, x4) \
55 movdqa x3, x4; \
56 por x0, x3; \
57 pxor x4, x0; \
58 pxor x2, x4; \
59 pxor RNOT, x4; \
60 pxor x1, x3; \
61 pand x0, x1; \
62 pxor x4, x1; \
63 pxor x0, x2;
64#define S0_2(x0, x1, x2, x3, x4) \
65 pxor x3, x0; \
66 por x0, x4; \
67 pxor x2, x0; \
68 pand x1, x2; \
69 pxor x2, x3; \
70 pxor RNOT, x1; \
71 pxor x4, x2; \
72 pxor x2, x1;
73
74#define S1_1(x0, x1, x2, x3, x4) \
75 movdqa x1, x4; \
76 pxor x0, x1; \
77 pxor x3, x0; \
78 pxor RNOT, x3; \
79 pand x1, x4; \
80 por x1, x0; \
81 pxor x2, x3; \
82 pxor x3, x0; \
83 pxor x3, x1;
84#define S1_2(x0, x1, x2, x3, x4) \
85 pxor x4, x3; \
86 por x4, x1; \
87 pxor x2, x4; \
88 pand x0, x2; \
89 pxor x1, x2; \
90 por x0, x1; \
91 pxor RNOT, x0; \
92 pxor x2, x0; \
93 pxor x1, x4;
94
95#define S2_1(x0, x1, x2, x3, x4) \
96 pxor RNOT, x3; \
97 pxor x0, x1; \
98 movdqa x0, x4; \
99 pand x2, x0; \
100 pxor x3, x0; \
101 por x4, x3; \
102 pxor x1, x2; \
103 pxor x1, x3; \
104 pand x0, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 pxor x2, x0; \
107 pand x3, x2; \
108 por x1, x3; \
109 pxor RNOT, x0; \
110 pxor x0, x3; \
111 pxor x0, x4; \
112 pxor x2, x0; \
113 por x2, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 movdqa x1, x4; \
117 pxor x3, x1; \
118 por x0, x3; \
119 pand x0, x4; \
120 pxor x2, x0; \
121 pxor x1, x2; \
122 pand x3, x1; \
123 pxor x3, x2; \
124 por x4, x0; \
125 pxor x3, x4;
126#define S3_2(x0, x1, x2, x3, x4) \
127 pxor x0, x1; \
128 pand x3, x0; \
129 pand x4, x3; \
130 pxor x2, x3; \
131 por x1, x4; \
132 pand x1, x2; \
133 pxor x3, x4; \
134 pxor x3, x0; \
135 pxor x2, x3;
136
137#define S4_1(x0, x1, x2, x3, x4) \
138 movdqa x3, x4; \
139 pand x0, x3; \
140 pxor x4, x0; \
141 pxor x2, x3; \
142 por x4, x2; \
143 pxor x1, x0; \
144 pxor x3, x4; \
145 por x0, x2; \
146 pxor x1, x2;
147#define S4_2(x0, x1, x2, x3, x4) \
148 pand x0, x1; \
149 pxor x4, x1; \
150 pand x2, x4; \
151 pxor x3, x2; \
152 pxor x0, x4; \
153 por x1, x3; \
154 pxor RNOT, x1; \
155 pxor x0, x3;
156
157#define S5_1(x0, x1, x2, x3, x4) \
158 movdqa x1, x4; \
159 por x0, x1; \
160 pxor x1, x2; \
161 pxor RNOT, x3; \
162 pxor x0, x4; \
163 pxor x2, x0; \
164 pand x4, x1; \
165 por x3, x4; \
166 pxor x0, x4;
167#define S5_2(x0, x1, x2, x3, x4) \
168 pand x3, x0; \
169 pxor x3, x1; \
170 pxor x2, x3; \
171 pxor x1, x0; \
172 pand x4, x2; \
173 pxor x2, x1; \
174 pand x0, x2; \
175 pxor x2, x3;
176
177#define S6_1(x0, x1, x2, x3, x4) \
178 movdqa x1, x4; \
179 pxor x0, x3; \
180 pxor x2, x1; \
181 pxor x0, x2; \
182 pand x3, x0; \
183 por x3, x1; \
184 pxor RNOT, x4; \
185 pxor x1, x0; \
186 pxor x2, x1;
187#define S6_2(x0, x1, x2, x3, x4) \
188 pxor x4, x3; \
189 pxor x0, x4; \
190 pand x0, x2; \
191 pxor x1, x4; \
192 pxor x3, x2; \
193 pand x1, x3; \
194 pxor x0, x3; \
195 pxor x2, x1;
196
197#define S7_1(x0, x1, x2, x3, x4) \
198 pxor RNOT, x1; \
199 movdqa x1, x4; \
200 pxor RNOT, x0; \
201 pand x2, x1; \
202 pxor x3, x1; \
203 por x4, x3; \
204 pxor x2, x4; \
205 pxor x3, x2; \
206 pxor x0, x3; \
207 por x1, x0;
208#define S7_2(x0, x1, x2, x3, x4) \
209 pand x0, x2; \
210 pxor x4, x0; \
211 pxor x3, x4; \
212 pand x0, x3; \
213 pxor x1, x4; \
214 pxor x4, x2; \
215 pxor x1, x3; \
216 por x0, x4; \
217 pxor x1, x4;
218
219#define SI0_1(x0, x1, x2, x3, x4) \
220 movdqa x3, x4; \
221 pxor x0, x1; \
222 por x1, x3; \
223 pxor x1, x4; \
224 pxor RNOT, x0; \
225 pxor x3, x2; \
226 pxor x0, x3; \
227 pand x1, x0; \
228 pxor x2, x0;
229#define SI0_2(x0, x1, x2, x3, x4) \
230 pand x3, x2; \
231 pxor x4, x3; \
232 pxor x3, x2; \
233 pxor x3, x1; \
234 pand x0, x3; \
235 pxor x0, x1; \
236 pxor x2, x0; \
237 pxor x3, x4;
238
239#define SI1_1(x0, x1, x2, x3, x4) \
240 pxor x3, x1; \
241 movdqa x0, x4; \
242 pxor x2, x0; \
243 pxor RNOT, x2; \
244 por x1, x4; \
245 pxor x3, x4; \
246 pand x1, x3; \
247 pxor x2, x1; \
248 pand x4, x2;
249#define SI1_2(x0, x1, x2, x3, x4) \
250 pxor x1, x4; \
251 por x3, x1; \
252 pxor x0, x3; \
253 pxor x0, x2; \
254 por x4, x0; \
255 pxor x4, x2; \
256 pxor x0, x1; \
257 pxor x1, x4;
258
259#define SI2_1(x0, x1, x2, x3, x4) \
260 pxor x1, x2; \
261 movdqa x3, x4; \
262 pxor RNOT, x3; \
263 por x2, x3; \
264 pxor x4, x2; \
265 pxor x0, x4; \
266 pxor x1, x3; \
267 por x2, x1; \
268 pxor x0, x2;
269#define SI2_2(x0, x1, x2, x3, x4) \
270 pxor x4, x1; \
271 por x3, x4; \
272 pxor x3, x2; \
273 pxor x2, x4; \
274 pand x1, x2; \
275 pxor x3, x2; \
276 pxor x4, x3; \
277 pxor x0, x4;
278
279#define SI3_1(x0, x1, x2, x3, x4) \
280 pxor x1, x2; \
281 movdqa x1, x4; \
282 pand x2, x1; \
283 pxor x0, x1; \
284 por x4, x0; \
285 pxor x3, x4; \
286 pxor x3, x0; \
287 por x1, x3; \
288 pxor x2, x1;
289#define SI3_2(x0, x1, x2, x3, x4) \
290 pxor x3, x1; \
291 pxor x2, x0; \
292 pxor x3, x2; \
293 pand x1, x3; \
294 pxor x0, x1; \
295 pand x2, x0; \
296 pxor x3, x4; \
297 pxor x0, x3; \
298 pxor x1, x0;
299
300#define SI4_1(x0, x1, x2, x3, x4) \
301 pxor x3, x2; \
302 movdqa x0, x4; \
303 pand x1, x0; \
304 pxor x2, x0; \
305 por x3, x2; \
306 pxor RNOT, x4; \
307 pxor x0, x1; \
308 pxor x2, x0; \
309 pand x4, x2;
310#define SI4_2(x0, x1, x2, x3, x4) \
311 pxor x0, x2; \
312 por x4, x0; \
313 pxor x3, x0; \
314 pand x2, x3; \
315 pxor x3, x4; \
316 pxor x1, x3; \
317 pand x0, x1; \
318 pxor x1, x4; \
319 pxor x3, x0;
320
321#define SI5_1(x0, x1, x2, x3, x4) \
322 movdqa x1, x4; \
323 por x2, x1; \
324 pxor x4, x2; \
325 pxor x3, x1; \
326 pand x4, x3; \
327 pxor x3, x2; \
328 por x0, x3; \
329 pxor RNOT, x0; \
330 pxor x2, x3; \
331 por x0, x2;
332#define SI5_2(x0, x1, x2, x3, x4) \
333 pxor x1, x4; \
334 pxor x4, x2; \
335 pand x0, x4; \
336 pxor x1, x0; \
337 pxor x3, x1; \
338 pand x2, x0; \
339 pxor x3, x2; \
340 pxor x2, x0; \
341 pxor x4, x2; \
342 pxor x3, x4;
343
344#define SI6_1(x0, x1, x2, x3, x4) \
345 pxor x2, x0; \
346 movdqa x0, x4; \
347 pand x3, x0; \
348 pxor x3, x2; \
349 pxor x2, x0; \
350 pxor x1, x3; \
351 por x4, x2; \
352 pxor x3, x2; \
353 pand x0, x3;
354#define SI6_2(x0, x1, x2, x3, x4) \
355 pxor RNOT, x0; \
356 pxor x1, x3; \
357 pand x2, x1; \
358 pxor x0, x4; \
359 pxor x4, x3; \
360 pxor x2, x4; \
361 pxor x1, x0; \
362 pxor x0, x2;
363
364#define SI7_1(x0, x1, x2, x3, x4) \
365 movdqa x3, x4; \
366 pand x0, x3; \
367 pxor x2, x0; \
368 por x4, x2; \
369 pxor x1, x4; \
370 pxor RNOT, x0; \
371 por x3, x1; \
372 pxor x0, x4; \
373 pand x2, x0; \
374 pxor x1, x0;
375#define SI7_2(x0, x1, x2, x3, x4) \
376 pand x2, x1; \
377 pxor x2, x3; \
378 pxor x3, x4; \
379 pand x3, x2; \
380 por x0, x3; \
381 pxor x4, x1; \
382 pxor x4, x3; \
383 pand x0, x4; \
384 pxor x2, x4;
385
386#define get_key(i, j, t) \
387 movd (4*(i)+(j))*4(CTX), t; \
388 pshufd $0, t, t;
389
390#define K2(x0, x1, x2, x3, x4, i) \
391 get_key(i, 0, RK0); \
392 get_key(i, 1, RK1); \
393 get_key(i, 2, RK2); \
394 get_key(i, 3, RK3); \
395 pxor RK0, x0 ## 1; \
396 pxor RK1, x1 ## 1; \
397 pxor RK2, x2 ## 1; \
398 pxor RK3, x3 ## 1; \
399 pxor RK0, x0 ## 2; \
400 pxor RK1, x1 ## 2; \
401 pxor RK2, x2 ## 2; \
402 pxor RK3, x3 ## 2;
403
404#define LK2(x0, x1, x2, x3, x4, i) \
405 movdqa x0 ## 1, x4 ## 1; \
406 pslld $13, x0 ## 1; \
407 psrld $(32 - 13), x4 ## 1; \
408 por x4 ## 1, x0 ## 1; \
409 pxor x0 ## 1, x1 ## 1; \
410 movdqa x2 ## 1, x4 ## 1; \
411 pslld $3, x2 ## 1; \
412 psrld $(32 - 3), x4 ## 1; \
413 por x4 ## 1, x2 ## 1; \
414 pxor x2 ## 1, x1 ## 1; \
415 movdqa x0 ## 2, x4 ## 2; \
416 pslld $13, x0 ## 2; \
417 psrld $(32 - 13), x4 ## 2; \
418 por x4 ## 2, x0 ## 2; \
419 pxor x0 ## 2, x1 ## 2; \
420 movdqa x2 ## 2, x4 ## 2; \
421 pslld $3, x2 ## 2; \
422 psrld $(32 - 3), x4 ## 2; \
423 por x4 ## 2, x2 ## 2; \
424 pxor x2 ## 2, x1 ## 2; \
425 movdqa x1 ## 1, x4 ## 1; \
426 pslld $1, x1 ## 1; \
427 psrld $(32 - 1), x4 ## 1; \
428 por x4 ## 1, x1 ## 1; \
429 movdqa x0 ## 1, x4 ## 1; \
430 pslld $3, x4 ## 1; \
431 pxor x2 ## 1, x3 ## 1; \
432 pxor x4 ## 1, x3 ## 1; \
433 movdqa x3 ## 1, x4 ## 1; \
434 get_key(i, 1, RK1); \
435 movdqa x1 ## 2, x4 ## 2; \
436 pslld $1, x1 ## 2; \
437 psrld $(32 - 1), x4 ## 2; \
438 por x4 ## 2, x1 ## 2; \
439 movdqa x0 ## 2, x4 ## 2; \
440 pslld $3, x4 ## 2; \
441 pxor x2 ## 2, x3 ## 2; \
442 pxor x4 ## 2, x3 ## 2; \
443 movdqa x3 ## 2, x4 ## 2; \
444 get_key(i, 3, RK3); \
445 pslld $7, x3 ## 1; \
446 psrld $(32 - 7), x4 ## 1; \
447 por x4 ## 1, x3 ## 1; \
448 movdqa x1 ## 1, x4 ## 1; \
449 pslld $7, x4 ## 1; \
450 pxor x1 ## 1, x0 ## 1; \
451 pxor x3 ## 1, x0 ## 1; \
452 pxor x3 ## 1, x2 ## 1; \
453 pxor x4 ## 1, x2 ## 1; \
454 get_key(i, 0, RK0); \
455 pslld $7, x3 ## 2; \
456 psrld $(32 - 7), x4 ## 2; \
457 por x4 ## 2, x3 ## 2; \
458 movdqa x1 ## 2, x4 ## 2; \
459 pslld $7, x4 ## 2; \
460 pxor x1 ## 2, x0 ## 2; \
461 pxor x3 ## 2, x0 ## 2; \
462 pxor x3 ## 2, x2 ## 2; \
463 pxor x4 ## 2, x2 ## 2; \
464 get_key(i, 2, RK2); \
465 pxor RK1, x1 ## 1; \
466 pxor RK3, x3 ## 1; \
467 movdqa x0 ## 1, x4 ## 1; \
468 pslld $5, x0 ## 1; \
469 psrld $(32 - 5), x4 ## 1; \
470 por x4 ## 1, x0 ## 1; \
471 movdqa x2 ## 1, x4 ## 1; \
472 pslld $22, x2 ## 1; \
473 psrld $(32 - 22), x4 ## 1; \
474 por x4 ## 1, x2 ## 1; \
475 pxor RK0, x0 ## 1; \
476 pxor RK2, x2 ## 1; \
477 pxor RK1, x1 ## 2; \
478 pxor RK3, x3 ## 2; \
479 movdqa x0 ## 2, x4 ## 2; \
480 pslld $5, x0 ## 2; \
481 psrld $(32 - 5), x4 ## 2; \
482 por x4 ## 2, x0 ## 2; \
483 movdqa x2 ## 2, x4 ## 2; \
484 pslld $22, x2 ## 2; \
485 psrld $(32 - 22), x4 ## 2; \
486 por x4 ## 2, x2 ## 2; \
487 pxor RK0, x0 ## 2; \
488 pxor RK2, x2 ## 2;
489
490#define KL2(x0, x1, x2, x3, x4, i) \
491 pxor RK0, x0 ## 1; \
492 pxor RK2, x2 ## 1; \
493 movdqa x0 ## 1, x4 ## 1; \
494 psrld $5, x0 ## 1; \
495 pslld $(32 - 5), x4 ## 1; \
496 por x4 ## 1, x0 ## 1; \
497 pxor RK3, x3 ## 1; \
498 pxor RK1, x1 ## 1; \
499 movdqa x2 ## 1, x4 ## 1; \
500 psrld $22, x2 ## 1; \
501 pslld $(32 - 22), x4 ## 1; \
502 por x4 ## 1, x2 ## 1; \
503 pxor x3 ## 1, x2 ## 1; \
504 pxor RK0, x0 ## 2; \
505 pxor RK2, x2 ## 2; \
506 movdqa x0 ## 2, x4 ## 2; \
507 psrld $5, x0 ## 2; \
508 pslld $(32 - 5), x4 ## 2; \
509 por x4 ## 2, x0 ## 2; \
510 pxor RK3, x3 ## 2; \
511 pxor RK1, x1 ## 2; \
512 movdqa x2 ## 2, x4 ## 2; \
513 psrld $22, x2 ## 2; \
514 pslld $(32 - 22), x4 ## 2; \
515 por x4 ## 2, x2 ## 2; \
516 pxor x3 ## 2, x2 ## 2; \
517 pxor x3 ## 1, x0 ## 1; \
518 movdqa x1 ## 1, x4 ## 1; \
519 pslld $7, x4 ## 1; \
520 pxor x1 ## 1, x0 ## 1; \
521 pxor x4 ## 1, x2 ## 1; \
522 movdqa x1 ## 1, x4 ## 1; \
523 psrld $1, x1 ## 1; \
524 pslld $(32 - 1), x4 ## 1; \
525 por x4 ## 1, x1 ## 1; \
526 pxor x3 ## 2, x0 ## 2; \
527 movdqa x1 ## 2, x4 ## 2; \
528 pslld $7, x4 ## 2; \
529 pxor x1 ## 2, x0 ## 2; \
530 pxor x4 ## 2, x2 ## 2; \
531 movdqa x1 ## 2, x4 ## 2; \
532 psrld $1, x1 ## 2; \
533 pslld $(32 - 1), x4 ## 2; \
534 por x4 ## 2, x1 ## 2; \
535 movdqa x3 ## 1, x4 ## 1; \
536 psrld $7, x3 ## 1; \
537 pslld $(32 - 7), x4 ## 1; \
538 por x4 ## 1, x3 ## 1; \
539 pxor x0 ## 1, x1 ## 1; \
540 movdqa x0 ## 1, x4 ## 1; \
541 pslld $3, x4 ## 1; \
542 pxor x4 ## 1, x3 ## 1; \
543 movdqa x0 ## 1, x4 ## 1; \
544 movdqa x3 ## 2, x4 ## 2; \
545 psrld $7, x3 ## 2; \
546 pslld $(32 - 7), x4 ## 2; \
547 por x4 ## 2, x3 ## 2; \
548 pxor x0 ## 2, x1 ## 2; \
549 movdqa x0 ## 2, x4 ## 2; \
550 pslld $3, x4 ## 2; \
551 pxor x4 ## 2, x3 ## 2; \
552 movdqa x0 ## 2, x4 ## 2; \
553 psrld $13, x0 ## 1; \
554 pslld $(32 - 13), x4 ## 1; \
555 por x4 ## 1, x0 ## 1; \
556 pxor x2 ## 1, x1 ## 1; \
557 pxor x2 ## 1, x3 ## 1; \
558 movdqa x2 ## 1, x4 ## 1; \
559 psrld $3, x2 ## 1; \
560 pslld $(32 - 3), x4 ## 1; \
561 por x4 ## 1, x2 ## 1; \
562 psrld $13, x0 ## 2; \
563 pslld $(32 - 13), x4 ## 2; \
564 por x4 ## 2, x0 ## 2; \
565 pxor x2 ## 2, x1 ## 2; \
566 pxor x2 ## 2, x3 ## 2; \
567 movdqa x2 ## 2, x4 ## 2; \
568 psrld $3, x2 ## 2; \
569 pslld $(32 - 3), x4 ## 2; \
570 por x4 ## 2, x2 ## 2;
571
572#define S(SBOX, x0, x1, x2, x3, x4) \
573 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
574 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
575 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
576 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
577
578#define SP(SBOX, x0, x1, x2, x3, x4, i) \
579 get_key(i, 0, RK0); \
580 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
581 get_key(i, 2, RK2); \
582 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
583 get_key(i, 3, RK3); \
584 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
585 get_key(i, 1, RK1); \
586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
587
588#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
589 movdqa x2, t3; \
590 movdqa x0, t1; \
591 unpcklps x3, t3; \
592 movdqa x0, t2; \
593 unpcklps x1, t1; \
594 unpckhps x1, t2; \
595 movdqa t3, x1; \
596 unpckhps x3, x2; \
597 movdqa t1, x0; \
598 movhlps t1, x1; \
599 movdqa t2, t1; \
600 movlhps t3, x0; \
601 movlhps x2, t1; \
602 movhlps t2, x2; \
603 movdqa x2, x3; \
604 movdqa t1, x2;
605
606#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
607 movdqu (0*4*4)(in), x0; \
608 movdqu (1*4*4)(in), x1; \
609 movdqu (2*4*4)(in), x2; \
610 movdqu (3*4*4)(in), x3; \
611 \
612 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
613
614#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
615 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
616 \
617 movdqu x0, (0*4*4)(out); \
618 movdqu x1, (1*4*4)(out); \
619 movdqu x2, (2*4*4)(out); \
620 movdqu x3, (3*4*4)(out);
621
622#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
623 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
624 \
625 movdqu (0*4*4)(out), t0; \
626 pxor t0, x0; \
627 movdqu x0, (0*4*4)(out); \
628 movdqu (1*4*4)(out), t0; \
629 pxor t0, x1; \
630 movdqu x1, (1*4*4)(out); \
631 movdqu (2*4*4)(out), t0; \
632 pxor t0, x2; \
633 movdqu x2, (2*4*4)(out); \
634 movdqu (3*4*4)(out), t0; \
635 pxor t0, x3; \
636 movdqu x3, (3*4*4)(out);
637
638.align 8
639.global __serpent_enc_blk_8way
640.type __serpent_enc_blk_8way,@function;
641
642__serpent_enc_blk_8way:
643 /* input:
644 * %rdi: ctx, CTX
645 * %rsi: dst
646 * %rdx: src
647 * %rcx: bool, if true: xor output
648 */
649
650 pcmpeqd RNOT, RNOT;
651
652 leaq (4*4*4)(%rdx), %rax;
653 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
654 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
655
656 K2(RA, RB, RC, RD, RE, 0);
657 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
658 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
659 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
660 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
661 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
662 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
663 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
664 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
665 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
666 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
667 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
668 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
669 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
670 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
671 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
672 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
673 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
674 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
675 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
676 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
677 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
678 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
679 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
680 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
681 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
682 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
683 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
684 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
685 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
686 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
687 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
688 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
689
690 leaq (4*4*4)(%rsi), %rax;
691
692 testb %cl, %cl;
693 jnz __enc_xor8;
694
695 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
696 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
697
698 ret;
699
700__enc_xor8:
701 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
702 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
703
704 ret;
705
706.align 8
707.global serpent_dec_blk_8way
708.type serpent_dec_blk_8way,@function;
709
710serpent_dec_blk_8way:
711 /* input:
712 * %rdi: ctx, CTX
713 * %rsi: dst
714 * %rdx: src
715 */
716
717 pcmpeqd RNOT, RNOT;
718
719 leaq (4*4*4)(%rdx), %rax;
720 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
721 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
722
723 K2(RA, RB, RC, RD, RE, 32);
724 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
725 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
726 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
727 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
728 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
729 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
730 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
731 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
732 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
733 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
734 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
735 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
736 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
737 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
738 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
739 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
740 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
741 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
742 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
743 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
744 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
745 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
746 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
747 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
748 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
749 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
750 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
751 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
752 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
753 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
754 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
755 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
756
757 leaq (4*4*4)(%rsi), %rax;
758 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
759 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
760
761 ret;
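
The x0 ## 1 / x0 ## 2 token pasting is what lets one macro body drive both register banks: callers pass the register-name prefix (RA..RE), and the preprocessor appends the bank digit to reach RA1..RE1 and RA2..RE2, doubling the 4-way version to eight blocks. A reduced illustration (the PXOR2 name is made up; .S files are run through the C preprocessor, so ## pastes tokens as in C):

#define PXOR2(a, b)		\
	pxor a ## 1, b ## 1;	\
	pxor a ## 2, b ## 2;

/* PXOR2(RA, RB) expands to:
 *	pxor RA1, RB1;
 *	pxor RA2, RB2;
 */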
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..7955a9b76b91
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,1070 @@
1/*
2 * Glue Code for SSE2 assembler versions of Serpent Cipher
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Glue code based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
11 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
12 * CTR part based on code (crypto/ctr.c) by:
13 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 */
31
32#include <linux/module.h>
33#include <linux/hardirq.h>
34#include <linux/types.h>
35#include <linux/crypto.h>
36#include <linux/err.h>
37#include <crypto/algapi.h>
38#include <crypto/serpent.h>
39#include <crypto/cryptd.h>
40#include <crypto/b128ops.h>
41#include <crypto/ctr.h>
42#include <crypto/lrw.h>
43#include <crypto/xts.h>
44#include <asm/i387.h>
45#include <asm/serpent.h>
46#include <crypto/scatterwalk.h>
47#include <linux/workqueue.h>
48#include <linux/spinlock.h>
49
50struct async_serpent_ctx {
51 struct cryptd_ablkcipher *cryptd_tfm;
52};
53
54static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
55{
56 if (fpu_enabled)
57 return true;
58
59 /* SSE2 is only used when the chunk to be processed is large enough,
60 * so do not enable the FPU until it is necessary.
61 */
62 if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
63 return false;
64
65 kernel_fpu_begin();
66 return true;
67}
68
69static inline void serpent_fpu_end(bool fpu_enabled)
70{
71 if (fpu_enabled)
72 kernel_fpu_end();
73}
74
75static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
76 bool enc)
77{
78 bool fpu_enabled = false;
79 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = SERPENT_BLOCK_SIZE;
81 unsigned int nbytes;
82 int err;
83
84 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86
87 while ((nbytes = walk->nbytes)) {
88 u8 *wsrc = walk->src.virt.addr;
89 u8 *wdst = walk->dst.virt.addr;
90
91 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
92
93 /* Process multi-block batch */
94 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
95 do {
96 if (enc)
97 serpent_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 serpent_dec_blk_xway(ctx, wdst, wsrc);
100
101 wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
102 wdst += bsize * SERPENT_PARALLEL_BLOCKS;
103 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
104 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
105
106 if (nbytes < bsize)
107 goto done;
108 }
109
110 /* Handle leftovers */
111 do {
112 if (enc)
113 __serpent_encrypt(ctx, wdst, wsrc);
114 else
115 __serpent_decrypt(ctx, wdst, wsrc);
116
117 wsrc += bsize;
118 wdst += bsize;
119 nbytes -= bsize;
120 } while (nbytes >= bsize);
121
122done:
123 err = blkcipher_walk_done(desc, walk, nbytes);
124 }
125
126 serpent_fpu_end(fpu_enabled);
127 return err;
128}
129
130static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 struct blkcipher_walk walk;
134
135 blkcipher_walk_init(&walk, dst, src, nbytes);
136 return ecb_crypt(desc, &walk, true);
137}
138
139static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 struct blkcipher_walk walk;
143
144 blkcipher_walk_init(&walk, dst, src, nbytes);
145 return ecb_crypt(desc, &walk, false);
146}
147
148static struct crypto_alg blk_ecb_alg = {
149 .cra_name = "__ecb-serpent-sse2",
150 .cra_driver_name = "__driver-ecb-serpent-sse2",
151 .cra_priority = 0,
152 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
153 .cra_blocksize = SERPENT_BLOCK_SIZE,
154 .cra_ctxsize = sizeof(struct serpent_ctx),
155 .cra_alignmask = 0,
156 .cra_type = &crypto_blkcipher_type,
157 .cra_module = THIS_MODULE,
158 .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
159 .cra_u = {
160 .blkcipher = {
161 .min_keysize = SERPENT_MIN_KEY_SIZE,
162 .max_keysize = SERPENT_MAX_KEY_SIZE,
163 .setkey = serpent_setkey,
164 .encrypt = ecb_encrypt,
165 .decrypt = ecb_decrypt,
166 },
167 },
168};
169
170static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
171 struct blkcipher_walk *walk)
172{
173 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
174 const unsigned int bsize = SERPENT_BLOCK_SIZE;
175 unsigned int nbytes = walk->nbytes;
176 u128 *src = (u128 *)walk->src.virt.addr;
177 u128 *dst = (u128 *)walk->dst.virt.addr;
178 u128 *iv = (u128 *)walk->iv;
179
180 do {
181 u128_xor(dst, src, iv);
182 __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
183 iv = dst;
184
185 src += 1;
186 dst += 1;
187 nbytes -= bsize;
188 } while (nbytes >= bsize);
189
190 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
191 return nbytes;
192}
193
194static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
195 struct scatterlist *src, unsigned int nbytes)
196{
197 struct blkcipher_walk walk;
198 int err;
199
200 blkcipher_walk_init(&walk, dst, src, nbytes);
201 err = blkcipher_walk_virt(desc, &walk);
202
203 while ((nbytes = walk.nbytes)) {
204 nbytes = __cbc_encrypt(desc, &walk);
205 err = blkcipher_walk_done(desc, &walk, nbytes);
206 }
207
208 return err;
209}
210
211static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
212 struct blkcipher_walk *walk)
213{
214 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
215 const unsigned int bsize = SERPENT_BLOCK_SIZE;
216 unsigned int nbytes = walk->nbytes;
217 u128 *src = (u128 *)walk->src.virt.addr;
218 u128 *dst = (u128 *)walk->dst.virt.addr;
219 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
220 u128 last_iv;
221 int i;
222
223 /* Start of the last block. */
224 src += nbytes / bsize - 1;
225 dst += nbytes / bsize - 1;
226
227 last_iv = *src;
228
229 /* Process multi-block batch */
230 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
231 do {
232 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
233 src -= SERPENT_PARALLEL_BLOCKS - 1;
234 dst -= SERPENT_PARALLEL_BLOCKS - 1;
235
236 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
237 ivs[i] = src[i];
238
239 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
240
241 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
242 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
243
244 nbytes -= bsize;
245 if (nbytes < bsize)
246 goto done;
247
248 u128_xor(dst, dst, src - 1);
249 src -= 1;
250 dst -= 1;
251 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
252
253 if (nbytes < bsize)
254 goto done;
255 }
256
257 /* Handle leftovers */
258 for (;;) {
259 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
260
261 nbytes -= bsize;
262 if (nbytes < bsize)
263 break;
264
265 u128_xor(dst, dst, src - 1);
266 src -= 1;
267 dst -= 1;
268 }
269
270done:
271 u128_xor(dst, dst, (u128 *)walk->iv);
272 *(u128 *)walk->iv = last_iv;
273
274 return nbytes;
275}
276
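Unlike CBC encryption, which is inherently serial, decryption needs only C[i-1] and C[i] to recover P[i] = D(C[i]) ^ C[i-1]; that independence is what lets __cbc_decrypt() above hand SERPENT_PARALLEL_BLOCKS ciphertext blocks to serpent_dec_blk_xway() at once and apply the xors afterwards from the saved ivs[]. A scalar sketch of the same identity (illustrative, not part of the patch):

/* Sketch: one-block-at-a-time version of the identity used above. */
static void cbc_decrypt_scalar(struct serpent_ctx *ctx, u128 *dst,
			       const u128 *src, unsigned int nblocks,
			       u128 *iv)
{
	u128 prev = *iv;
	unsigned int i;

	for (i = 0; i < nblocks; i++) {
		u128 cur = src[i];	/* save C[i]: decrypt may be in place */

		__serpent_decrypt(ctx, (u8 *)&dst[i], (u8 *)&src[i]);
		u128_xor(&dst[i], &dst[i], &prev);  /* P[i] = D(C[i]) ^ C[i-1] */
		prev = cur;
	}
	*iv = prev;			/* chaining value for the next call */
}
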
277static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 bool fpu_enabled = false;
281 struct blkcipher_walk walk;
282 int err;
283
284 blkcipher_walk_init(&walk, dst, src, nbytes);
285 err = blkcipher_walk_virt(desc, &walk);
286 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
287
288 while ((nbytes = walk.nbytes)) {
289 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
290 nbytes = __cbc_decrypt(desc, &walk);
291 err = blkcipher_walk_done(desc, &walk, nbytes);
292 }
293
294 serpent_fpu_end(fpu_enabled);
295 return err;
296}
297
298static struct crypto_alg blk_cbc_alg = {
299 .cra_name = "__cbc-serpent-sse2",
300 .cra_driver_name = "__driver-cbc-serpent-sse2",
301 .cra_priority = 0,
302 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
303 .cra_blocksize = SERPENT_BLOCK_SIZE,
304 .cra_ctxsize = sizeof(struct serpent_ctx),
305 .cra_alignmask = 0,
306 .cra_type = &crypto_blkcipher_type,
307 .cra_module = THIS_MODULE,
308 .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
309 .cra_u = {
310 .blkcipher = {
311 .min_keysize = SERPENT_MIN_KEY_SIZE,
312 .max_keysize = SERPENT_MAX_KEY_SIZE,
313 .setkey = serpent_setkey,
314 .encrypt = cbc_encrypt,
315 .decrypt = cbc_decrypt,
316 },
317 },
318};
319
320static inline void u128_to_be128(be128 *dst, const u128 *src)
321{
322 dst->a = cpu_to_be64(src->a);
323 dst->b = cpu_to_be64(src->b);
324}
325
326static inline void be128_to_u128(u128 *dst, const be128 *src)
327{
328 dst->a = be64_to_cpu(src->a);
329 dst->b = be64_to_cpu(src->b);
330}
331
332static inline void u128_inc(u128 *i)
333{
334 i->b++;
335 if (!i->b)
336 i->a++;
337}
338
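The u128 counter lives in native byte order with .a the high and .b the low 64-bit half, so u128_inc() must carry from b into a; u128_to_be128() performs the endian swap only when a counter block is actually encrypted. A tiny check of the carry behaviour (sketch, not part of the patch):

static void u128_inc_demo(void)
{
	u128 ctr = { .a = 0, .b = 0xffffffffffffffffULL };

	u128_inc(&ctr);				/* low half wraps to zero... */
	BUG_ON(ctr.a != 1 || ctr.b != 0);	/* ...and carries into .a */
}
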
339static void ctr_crypt_final(struct blkcipher_desc *desc,
340 struct blkcipher_walk *walk)
341{
342 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
343 u8 *ctrblk = walk->iv;
344 u8 keystream[SERPENT_BLOCK_SIZE];
345 u8 *src = walk->src.virt.addr;
346 u8 *dst = walk->dst.virt.addr;
347 unsigned int nbytes = walk->nbytes;
348
349 __serpent_encrypt(ctx, keystream, ctrblk);
350 crypto_xor(keystream, src, nbytes);
351 memcpy(dst, keystream, nbytes);
352
353 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
354}
355
356static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
357 struct blkcipher_walk *walk)
358{
359 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
360 const unsigned int bsize = SERPENT_BLOCK_SIZE;
361 unsigned int nbytes = walk->nbytes;
362 u128 *src = (u128 *)walk->src.virt.addr;
363 u128 *dst = (u128 *)walk->dst.virt.addr;
364 u128 ctrblk;
365 be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
366 int i;
367
368 be128_to_u128(&ctrblk, (be128 *)walk->iv);
369
370 /* Process multi-block batch */
371 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
372 do {
373 /* create ctrblks for parallel encrypt */
374 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
375 if (dst != src)
376 dst[i] = src[i];
377
378 u128_to_be128(&ctrblocks[i], &ctrblk);
379 u128_inc(&ctrblk);
380 }
381
382 serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
383 (u8 *)ctrblocks);
384
385 src += SERPENT_PARALLEL_BLOCKS;
386 dst += SERPENT_PARALLEL_BLOCKS;
387 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
388 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
389
390 if (nbytes < bsize)
391 goto done;
392 }
393
394 /* Handle leftovers */
395 do {
396 if (dst != src)
397 *dst = *src;
398
399 u128_to_be128(&ctrblocks[0], &ctrblk);
400 u128_inc(&ctrblk);
401
402 __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
403 u128_xor(dst, dst, (u128 *)ctrblocks);
404
405 src += 1;
406 dst += 1;
407 nbytes -= bsize;
408 } while (nbytes >= bsize);
409
410done:
411 u128_to_be128((be128 *)walk->iv, &ctrblk);
412 return nbytes;
413}
414
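CTR mode turns the block cipher into a keystream generator, C[i] = P[i] ^ E(K, ctr + i), which is why the same ctr_crypt() below serves as both .encrypt and .decrypt. The batch path above feeds whole counter groups through serpent_enc_blk_xway_xor(); the per-block equivalent, written out as a scalar sketch (illustrative, not part of the patch):

static void ctr_block_scalar(struct serpent_ctx *ctx, u128 *dst,
			     const u128 *src, u128 *ctr)
{
	be128 ks;

	u128_to_be128(&ks, ctr);			/* counter, big endian */
	__serpent_encrypt(ctx, (u8 *)&ks, (u8 *)&ks);	/* keystream block */
	if (dst != src)
		*dst = *src;
	u128_xor(dst, dst, (u128 *)&ks);		/* C = P ^ keystream */
	u128_inc(ctr);
}
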
415static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
416 struct scatterlist *src, unsigned int nbytes)
417{
418 bool fpu_enabled = false;
419 struct blkcipher_walk walk;
420 int err;
421
422 blkcipher_walk_init(&walk, dst, src, nbytes);
423 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
424 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
425
426 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
427 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
428 nbytes = __ctr_crypt(desc, &walk);
429 err = blkcipher_walk_done(desc, &walk, nbytes);
430 }
431
432 serpent_fpu_end(fpu_enabled);
433
434 if (walk.nbytes) {
435 ctr_crypt_final(desc, &walk);
436 err = blkcipher_walk_done(desc, &walk, 0);
437 }
438
439 return err;
440}
441
442static struct crypto_alg blk_ctr_alg = {
443 .cra_name = "__ctr-serpent-sse2",
444 .cra_driver_name = "__driver-ctr-serpent-sse2",
445 .cra_priority = 0,
446 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
447 .cra_blocksize = 1,
448 .cra_ctxsize = sizeof(struct serpent_ctx),
449 .cra_alignmask = 0,
450 .cra_type = &crypto_blkcipher_type,
451 .cra_module = THIS_MODULE,
452 .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
453 .cra_u = {
454 .blkcipher = {
455 .min_keysize = SERPENT_MIN_KEY_SIZE,
456 .max_keysize = SERPENT_MAX_KEY_SIZE,
457 .ivsize = SERPENT_BLOCK_SIZE,
458 .setkey = serpent_setkey,
459 .encrypt = ctr_crypt,
460 .decrypt = ctr_crypt,
461 },
462 },
463};
464
465struct crypt_priv {
466 struct serpent_ctx *ctx;
467 bool fpu_enabled;
468};
469
470static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
471{
472 const unsigned int bsize = SERPENT_BLOCK_SIZE;
473 struct crypt_priv *ctx = priv;
474 int i;
475
476 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
477
478 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
479 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
480 return;
481 }
482
483 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
484 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
485}
486
487static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
488{
489 const unsigned int bsize = SERPENT_BLOCK_SIZE;
490 struct crypt_priv *ctx = priv;
491 int i;
492
493 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
494
495 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
496 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
497 return;
498 }
499
500 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
501 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
502}
503
504struct serpent_lrw_ctx {
505 struct lrw_table_ctx lrw_table;
506 struct serpent_ctx serpent_ctx;
507};
508
509static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
510 unsigned int keylen)
511{
512 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
513 int err;
514
515 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
516 SERPENT_BLOCK_SIZE);
517 if (err)
518 return err;
519
520 return lrw_init_table(&ctx->lrw_table, key + keylen -
521 SERPENT_BLOCK_SIZE);
522}
523
524static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
525 struct scatterlist *src, unsigned int nbytes)
526{
527 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
528 be128 buf[SERPENT_PARALLEL_BLOCKS];
529 struct crypt_priv crypt_ctx = {
530 .ctx = &ctx->serpent_ctx,
531 .fpu_enabled = false,
532 };
533 struct lrw_crypt_req req = {
534 .tbuf = buf,
535 .tbuflen = sizeof(buf),
536
537 .table_ctx = &ctx->lrw_table,
538 .crypt_ctx = &crypt_ctx,
539 .crypt_fn = encrypt_callback,
540 };
541 int ret;
542
543 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
544 ret = lrw_crypt(desc, dst, src, nbytes, &req);
545 serpent_fpu_end(crypt_ctx.fpu_enabled);
546
547 return ret;
548}
549
550static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
551 struct scatterlist *src, unsigned int nbytes)
552{
553 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
554 be128 buf[SERPENT_PARALLEL_BLOCKS];
555 struct crypt_priv crypt_ctx = {
556 .ctx = &ctx->serpent_ctx,
557 .fpu_enabled = false,
558 };
559 struct lrw_crypt_req req = {
560 .tbuf = buf,
561 .tbuflen = sizeof(buf),
562
563 .table_ctx = &ctx->lrw_table,
564 .crypt_ctx = &crypt_ctx,
565 .crypt_fn = decrypt_callback,
566 };
567 int ret;
568
569 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
570 ret = lrw_crypt(desc, dst, src, nbytes, &req);
571 serpent_fpu_end(crypt_ctx.fpu_enabled);
572
573 return ret;
574}
575
576static void lrw_exit_tfm(struct crypto_tfm *tfm)
577{
578 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
579
580 lrw_free_table(&ctx->lrw_table);
581}
582
583static struct crypto_alg blk_lrw_alg = {
584 .cra_name = "__lrw-serpent-sse2",
585 .cra_driver_name = "__driver-lrw-serpent-sse2",
586 .cra_priority = 0,
587 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
588 .cra_blocksize = SERPENT_BLOCK_SIZE,
589 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
590 .cra_alignmask = 0,
591 .cra_type = &crypto_blkcipher_type,
592 .cra_module = THIS_MODULE,
593 .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
594 .cra_exit = lrw_exit_tfm,
595 .cra_u = {
596 .blkcipher = {
597 .min_keysize = SERPENT_MIN_KEY_SIZE +
598 SERPENT_BLOCK_SIZE,
599 .max_keysize = SERPENT_MAX_KEY_SIZE +
600 SERPENT_BLOCK_SIZE,
601 .ivsize = SERPENT_BLOCK_SIZE,
602 .setkey = lrw_serpent_setkey,
603 .encrypt = lrw_encrypt,
604 .decrypt = lrw_decrypt,
605 },
606 },
607};
608
609struct serpent_xts_ctx {
610 struct serpent_ctx tweak_ctx;
611 struct serpent_ctx crypt_ctx;
612};
613
614static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
615 unsigned int keylen)
616{
617 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
618 u32 *flags = &tfm->crt_flags;
619 int err;
620
621 /* key consists of two keys of equal size concatenated, therefore
622 * the total length must be even
623 */
624 if (keylen % 2) {
625 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
626 return -EINVAL;
627 }
628
629 /* first half of xts-key is for crypt */
630 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
631 if (err)
632 return err;
633
634 /* second half of xts-key is for tweak */
635 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
636}
637
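/*
 * The split above mirrors the XTS construction: the first half keys the
 * data cipher, the second half keys the cipher that encrypts the sector
 * number into the initial tweak.  A self-contained one-sector sketch
 * (generic 16-byte block cipher via a callback; whole blocks only,
 * ciphertext stealing omitted; these names are illustrative, not the
 * kernel's xts_crypt() interface):
 */

#include <stdint.h>
#include <stddef.h>

typedef void (*blk_enc_t)(const void *ctx, uint8_t dst[16],
			  const uint8_t src[16]);

/* Multiply the tweak by x in GF(2^128), little-endian byte order
 * (the IEEE P1619 convention). */
static void gf128_mul_x_ble(uint8_t t[16])
{
	uint8_t carry = t[15] & 0x80;
	int i;

	for (i = 15; i > 0; i--)
		t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
	t[0] = (uint8_t)(t[0] << 1);
	if (carry)
		t[0] ^= 0x87;
}

static void xts_encrypt_sector(const void *crypt_ctx, const void *tweak_ctx,
			       blk_enc_t enc, const uint8_t iv[16],
			       uint8_t *data, size_t nblocks)
{
	uint8_t t[16];
	size_t i;
	int j;

	enc(tweak_ctx, t, iv);			/* T_0 = E_K2(sector no.) */
	for (i = 0; i < nblocks; i++, data += 16) {
		for (j = 0; j < 16; j++)
			data[j] ^= t[j];	/* PP = P xor T */
		enc(crypt_ctx, data, data);	/* CC = E_K1(PP) */
		for (j = 0; j < 16; j++)
			data[j] ^= t[j];	/* C = CC xor T */
		gf128_mul_x_ble(t);		/* T_{i+1} = T_i * x */
	}
}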
638static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
639 struct scatterlist *src, unsigned int nbytes)
640{
641 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
642 be128 buf[SERPENT_PARALLEL_BLOCKS];
643 struct crypt_priv crypt_ctx = {
644 .ctx = &ctx->crypt_ctx,
645 .fpu_enabled = false,
646 };
647 struct xts_crypt_req req = {
648 .tbuf = buf,
649 .tbuflen = sizeof(buf),
650
651 .tweak_ctx = &ctx->tweak_ctx,
652 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
653 .crypt_ctx = &crypt_ctx,
654 .crypt_fn = encrypt_callback,
655 };
656 int ret;
657
658 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
659 ret = xts_crypt(desc, dst, src, nbytes, &req);
660 serpent_fpu_end(crypt_ctx.fpu_enabled);
661
662 return ret;
663}
664
665static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
666 struct scatterlist *src, unsigned int nbytes)
667{
668 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
669 be128 buf[SERPENT_PARALLEL_BLOCKS];
670 struct crypt_priv crypt_ctx = {
671 .ctx = &ctx->crypt_ctx,
672 .fpu_enabled = false,
673 };
674 struct xts_crypt_req req = {
675 .tbuf = buf,
676 .tbuflen = sizeof(buf),
677
678 .tweak_ctx = &ctx->tweak_ctx,
679 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
680 .crypt_ctx = &crypt_ctx,
681 .crypt_fn = decrypt_callback,
682 };
683 int ret;
684
685 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
686 ret = xts_crypt(desc, dst, src, nbytes, &req);
687 serpent_fpu_end(crypt_ctx.fpu_enabled);
688
689 return ret;
690}
691
692static struct crypto_alg blk_xts_alg = {
693 .cra_name = "__xts-serpent-sse2",
694 .cra_driver_name = "__driver-xts-serpent-sse2",
695 .cra_priority = 0,
696 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
697 .cra_blocksize = SERPENT_BLOCK_SIZE,
698 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
699 .cra_alignmask = 0,
700 .cra_type = &crypto_blkcipher_type,
701 .cra_module = THIS_MODULE,
702 .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
703 .cra_u = {
704 .blkcipher = {
705 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
706 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
707 .ivsize = SERPENT_BLOCK_SIZE,
708 .setkey = xts_serpent_setkey,
709 .encrypt = xts_encrypt,
710 .decrypt = xts_decrypt,
711 },
712 },
713};
714
715static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
716 unsigned int key_len)
717{
718 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
719 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
720 int err;
721
722 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
723 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
724 & CRYPTO_TFM_REQ_MASK);
725 err = crypto_ablkcipher_setkey(child, key, key_len);
726 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
727 & CRYPTO_TFM_RES_MASK);
728 return err;
729}
730
731static int __ablk_encrypt(struct ablkcipher_request *req)
732{
733 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
734 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
735 struct blkcipher_desc desc;
736
737 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
738 desc.info = req->info;
739 desc.flags = 0;
740
741 return crypto_blkcipher_crt(desc.tfm)->encrypt(
742 &desc, req->dst, req->src, req->nbytes);
743}
744
745static int ablk_encrypt(struct ablkcipher_request *req)
746{
747 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
748 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
749
750 if (!irq_fpu_usable()) {
751 struct ablkcipher_request *cryptd_req =
752 ablkcipher_request_ctx(req);
753
754 memcpy(cryptd_req, req, sizeof(*req));
755 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
756
757 return crypto_ablkcipher_encrypt(cryptd_req);
758 } else {
759 return __ablk_encrypt(req);
760 }
761}
762
763static int ablk_decrypt(struct ablkcipher_request *req)
764{
765 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
766 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
767
768 if (!irq_fpu_usable()) {
769 struct ablkcipher_request *cryptd_req =
770 ablkcipher_request_ctx(req);
771
772 memcpy(cryptd_req, req, sizeof(*req));
773 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
774
775 return crypto_ablkcipher_decrypt(cryptd_req);
776 } else {
777 struct blkcipher_desc desc;
778
779 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
780 desc.info = req->info;
781 desc.flags = 0;
782
783 return crypto_blkcipher_crt(desc.tfm)->decrypt(
784 &desc, req->dst, req->src, req->nbytes);
785 }
786}
787
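/*
 * ablk_encrypt() and ablk_decrypt() encode one rule: SSE2 needs the
 * FPU, and the FPU must not be touched where irq_fpu_usable() is false,
 * so such requests are re-targeted at the cryptd worker (allocated in
 * the ablk_*_init() helpers below) and complete later in process
 * context.  The control flow, reduced to a compilable toy with
 * stand-in names:
 */

#include <stdbool.h>
#include <stdio.h>

struct request { const char *name; };

static bool fpu_usable_now(void)	/* stand-in for irq_fpu_usable() */
{
	return true;
}

static int run_sse2_now(struct request *req)
{
	printf("%s: synchronous SSE2 path\n", req->name);
	return 0;
}

static int queue_to_cryptd(struct request *req)
{
	printf("%s: deferred to worker thread\n", req->name);
	return 0;	/* the real async API returns -EINPROGRESS here */
}

static int dispatch(struct request *req)
{
	if (!fpu_usable_now())
		return queue_to_cryptd(req);
	return run_sse2_now(req);
}

int main(void)
{
	struct request req = { "xts(serpent)" };

	return dispatch(&req);
}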
788static void ablk_exit(struct crypto_tfm *tfm)
789{
790 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
791
792 cryptd_free_ablkcipher(ctx->cryptd_tfm);
793}
794
795static void ablk_init_common(struct crypto_tfm *tfm,
796 struct cryptd_ablkcipher *cryptd_tfm)
797{
798 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
799
800 ctx->cryptd_tfm = cryptd_tfm;
801 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
802 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
803}
804
805static int ablk_ecb_init(struct crypto_tfm *tfm)
806{
807 struct cryptd_ablkcipher *cryptd_tfm;
808
809 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
810 if (IS_ERR(cryptd_tfm))
811 return PTR_ERR(cryptd_tfm);
812 ablk_init_common(tfm, cryptd_tfm);
813 return 0;
814}
815
816static struct crypto_alg ablk_ecb_alg = {
817 .cra_name = "ecb(serpent)",
818 .cra_driver_name = "ecb-serpent-sse2",
819 .cra_priority = 400,
820 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
821 .cra_blocksize = SERPENT_BLOCK_SIZE,
822 .cra_ctxsize = sizeof(struct async_serpent_ctx),
823 .cra_alignmask = 0,
824 .cra_type = &crypto_ablkcipher_type,
825 .cra_module = THIS_MODULE,
826 .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
827 .cra_init = ablk_ecb_init,
828 .cra_exit = ablk_exit,
829 .cra_u = {
830 .ablkcipher = {
831 .min_keysize = SERPENT_MIN_KEY_SIZE,
832 .max_keysize = SERPENT_MAX_KEY_SIZE,
833 .setkey = ablk_set_key,
834 .encrypt = ablk_encrypt,
835 .decrypt = ablk_decrypt,
836 },
837 },
838};
839
840static int ablk_cbc_init(struct crypto_tfm *tfm)
841{
842 struct cryptd_ablkcipher *cryptd_tfm;
843
844 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
845 if (IS_ERR(cryptd_tfm))
846 return PTR_ERR(cryptd_tfm);
847 ablk_init_common(tfm, cryptd_tfm);
848 return 0;
849}
850
851static struct crypto_alg ablk_cbc_alg = {
852 .cra_name = "cbc(serpent)",
853 .cra_driver_name = "cbc-serpent-sse2",
854 .cra_priority = 400,
855 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
856 .cra_blocksize = SERPENT_BLOCK_SIZE,
857 .cra_ctxsize = sizeof(struct async_serpent_ctx),
858 .cra_alignmask = 0,
859 .cra_type = &crypto_ablkcipher_type,
860 .cra_module = THIS_MODULE,
861 .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
862 .cra_init = ablk_cbc_init,
863 .cra_exit = ablk_exit,
864 .cra_u = {
865 .ablkcipher = {
866 .min_keysize = SERPENT_MIN_KEY_SIZE,
867 .max_keysize = SERPENT_MAX_KEY_SIZE,
868 .ivsize = SERPENT_BLOCK_SIZE,
869 .setkey = ablk_set_key,
870 .encrypt = __ablk_encrypt,
871 .decrypt = ablk_decrypt,
872 },
873 },
874};
875
876static int ablk_ctr_init(struct crypto_tfm *tfm)
877{
878 struct cryptd_ablkcipher *cryptd_tfm;
879
880 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
881 if (IS_ERR(cryptd_tfm))
882 return PTR_ERR(cryptd_tfm);
883 ablk_init_common(tfm, cryptd_tfm);
884 return 0;
885}
886
887static struct crypto_alg ablk_ctr_alg = {
888 .cra_name = "ctr(serpent)",
889 .cra_driver_name = "ctr-serpent-sse2",
890 .cra_priority = 400,
891 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
892 .cra_blocksize = 1,
893 .cra_ctxsize = sizeof(struct async_serpent_ctx),
894 .cra_alignmask = 0,
895 .cra_type = &crypto_ablkcipher_type,
896 .cra_module = THIS_MODULE,
897 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
898 .cra_init = ablk_ctr_init,
899 .cra_exit = ablk_exit,
900 .cra_u = {
901 .ablkcipher = {
902 .min_keysize = SERPENT_MIN_KEY_SIZE,
903 .max_keysize = SERPENT_MAX_KEY_SIZE,
904 .ivsize = SERPENT_BLOCK_SIZE,
905 .setkey = ablk_set_key,
906 .encrypt = ablk_encrypt,
907 .decrypt = ablk_encrypt,
908 .geniv = "chainiv",
909 },
910 },
911};
912
913static int ablk_lrw_init(struct crypto_tfm *tfm)
914{
915 struct cryptd_ablkcipher *cryptd_tfm;
916
917 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
918 if (IS_ERR(cryptd_tfm))
919 return PTR_ERR(cryptd_tfm);
920 ablk_init_common(tfm, cryptd_tfm);
921 return 0;
922}
923
924static struct crypto_alg ablk_lrw_alg = {
925 .cra_name = "lrw(serpent)",
926 .cra_driver_name = "lrw-serpent-sse2",
927 .cra_priority = 400,
928 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
929 .cra_blocksize = SERPENT_BLOCK_SIZE,
930 .cra_ctxsize = sizeof(struct async_serpent_ctx),
931 .cra_alignmask = 0,
932 .cra_type = &crypto_ablkcipher_type,
933 .cra_module = THIS_MODULE,
934 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
935 .cra_init = ablk_lrw_init,
936 .cra_exit = ablk_exit,
937 .cra_u = {
938 .ablkcipher = {
939 .min_keysize = SERPENT_MIN_KEY_SIZE +
940 SERPENT_BLOCK_SIZE,
941 .max_keysize = SERPENT_MAX_KEY_SIZE +
942 SERPENT_BLOCK_SIZE,
943 .ivsize = SERPENT_BLOCK_SIZE,
944 .setkey = ablk_set_key,
945 .encrypt = ablk_encrypt,
946 .decrypt = ablk_decrypt,
947 },
948 },
949};
950
951static int ablk_xts_init(struct crypto_tfm *tfm)
952{
953 struct cryptd_ablkcipher *cryptd_tfm;
954
955 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
956 if (IS_ERR(cryptd_tfm))
957 return PTR_ERR(cryptd_tfm);
958 ablk_init_common(tfm, cryptd_tfm);
959 return 0;
960}
961
962static struct crypto_alg ablk_xts_alg = {
963 .cra_name = "xts(serpent)",
964 .cra_driver_name = "xts-serpent-sse2",
965 .cra_priority = 400,
966 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
967 .cra_blocksize = SERPENT_BLOCK_SIZE,
968 .cra_ctxsize = sizeof(struct async_serpent_ctx),
969 .cra_alignmask = 0,
970 .cra_type = &crypto_ablkcipher_type,
971 .cra_module = THIS_MODULE,
972 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
973 .cra_init = ablk_xts_init,
974 .cra_exit = ablk_exit,
975 .cra_u = {
976 .ablkcipher = {
977 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
978 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
979 .ivsize = SERPENT_BLOCK_SIZE,
980 .setkey = ablk_set_key,
981 .encrypt = ablk_encrypt,
982 .decrypt = ablk_decrypt,
983 },
984 },
985};
986
987static int __init serpent_sse2_init(void)
988{
989 int err;
990
991 if (!cpu_has_xmm2) {
992 printk(KERN_INFO "SSE2 instructions are not detected.\n");
993 return -ENODEV;
994 }
995
996 err = crypto_register_alg(&blk_ecb_alg);
997 if (err)
998 goto blk_ecb_err;
999 err = crypto_register_alg(&blk_cbc_alg);
1000 if (err)
1001 goto blk_cbc_err;
1002 err = crypto_register_alg(&blk_ctr_alg);
1003 if (err)
1004 goto blk_ctr_err;
1005 err = crypto_register_alg(&ablk_ecb_alg);
1006 if (err)
1007 goto ablk_ecb_err;
1008 err = crypto_register_alg(&ablk_cbc_alg);
1009 if (err)
1010 goto ablk_cbc_err;
1011 err = crypto_register_alg(&ablk_ctr_alg);
1012 if (err)
1013 goto ablk_ctr_err;
1014 err = crypto_register_alg(&blk_lrw_alg);
1015 if (err)
1016 goto blk_lrw_err;
1017 err = crypto_register_alg(&ablk_lrw_alg);
1018 if (err)
1019 goto ablk_lrw_err;
1020 err = crypto_register_alg(&blk_xts_alg);
1021 if (err)
1022 goto blk_xts_err;
1023 err = crypto_register_alg(&ablk_xts_alg);
1024 if (err)
1025 goto ablk_xts_err;
1026 return err;
1027
1028 crypto_unregister_alg(&ablk_xts_alg);
1029ablk_xts_err:
1030 crypto_unregister_alg(&blk_xts_alg);
1031blk_xts_err:
1032 crypto_unregister_alg(&ablk_lrw_alg);
1033ablk_lrw_err:
1034 crypto_unregister_alg(&blk_lrw_alg);
1035blk_lrw_err:
1036 crypto_unregister_alg(&ablk_ctr_alg);
1037ablk_ctr_err:
1038 crypto_unregister_alg(&ablk_cbc_alg);
1039ablk_cbc_err:
1040 crypto_unregister_alg(&ablk_ecb_alg);
1041ablk_ecb_err:
1042 crypto_unregister_alg(&blk_ctr_alg);
1043blk_ctr_err:
1044 crypto_unregister_alg(&blk_cbc_alg);
1045blk_cbc_err:
1046 crypto_unregister_alg(&blk_ecb_alg);
1047blk_ecb_err:
1048 return err;
1049}
1050
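/*
 * The ladder above is the usual goto-unwind idiom: each label undoes
 * exactly what was registered before the failing step, and the
 * unreachable crypto_unregister_alg(&ablk_xts_alg) just before the
 * first label appears deliberate, keeping the list symmetric so the
 * next algorithm can be appended mechanically.  The shape, as a
 * stand-alone stub example:
 */

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static void unregister_a(void) { puts("undo a"); }
static void unregister_b(void) { puts("undo b"); }

static int setup(void)
{
	int err;

	err = register_a();
	if (err)
		goto a_err;
	err = register_b();
	if (err)
		goto b_err;
	return 0;

	unregister_b();	/* unreachable on purpose, mirrors the file above */
b_err:
	unregister_a();
a_err:
	return err;
}

int main(void)
{
	return setup();
}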
1051static void __exit serpent_sse2_exit(void)
1052{
1053 crypto_unregister_alg(&ablk_xts_alg);
1054 crypto_unregister_alg(&blk_xts_alg);
1055 crypto_unregister_alg(&ablk_lrw_alg);
1056 crypto_unregister_alg(&blk_lrw_alg);
1057 crypto_unregister_alg(&ablk_ctr_alg);
1058 crypto_unregister_alg(&ablk_cbc_alg);
1059 crypto_unregister_alg(&ablk_ecb_alg);
1060 crypto_unregister_alg(&blk_ctr_alg);
1061 crypto_unregister_alg(&blk_cbc_alg);
1062 crypto_unregister_alg(&blk_ecb_alg);
1063}
1064
1065module_init(serpent_sse2_init);
1066module_exit(serpent_sse2_exit);
1067
1068MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
1069MODULE_LICENSE("GPL");
1070MODULE_ALIAS("serpent");
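A consumer-side view, for context: the generic names registered above mean that dm-crypt or any other in-kernel user asking for "xts(serpent)" lands on the priority-400 SSE2 implementation ahead of serpent-generic. A hedged sketch against the ablkcipher API of this era (error handling trimmed; not taken from this patch):

	#include <linux/crypto.h>
	#include <linux/err.h>

	static int use_serpent_xts(const u8 *key, unsigned int keylen)
	{
		struct crypto_ablkcipher *tfm;
		int err;

		tfm = crypto_alloc_ablkcipher("xts(serpent)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* keylen is two serpent keys, e.g. 64 bytes for 2 x 256 bits */
		err = crypto_ablkcipher_setkey(tfm, key, keylen);

		crypto_free_ablkcipher(tfm);
		return err;
	}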
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5ede9c444c3e..7fee8c152f93 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -32,6 +32,8 @@
32#include <crypto/algapi.h> 32#include <crypto/algapi.h>
33#include <crypto/twofish.h> 33#include <crypto/twofish.h>
34#include <crypto/b128ops.h> 34#include <crypto/b128ops.h>
35#include <crypto/lrw.h>
36#include <crypto/xts.h>
35 37
36/* regular block cipher functions from twofish_x86_64 module */ 38/* regular block cipher functions from twofish_x86_64 module */
37asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, 39asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
@@ -432,6 +434,209 @@ static struct crypto_alg blk_ctr_alg = {
432 }, 434 },
433}; 435};
434 436
437static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
438{
439 const unsigned int bsize = TF_BLOCK_SIZE;
440 struct twofish_ctx *ctx = priv;
441 int i;
442
443 if (nbytes == 3 * bsize) {
444 twofish_enc_blk_3way(ctx, srcdst, srcdst);
445 return;
446 }
447
448 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
449 twofish_enc_blk(ctx, srcdst, srcdst);
450}
451
452static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
453{
454 const unsigned int bsize = TF_BLOCK_SIZE;
455 struct twofish_ctx *ctx = priv;
456 int i;
457
458 if (nbytes == 3 * bsize) {
459 twofish_dec_blk_3way(ctx, srcdst, srcdst);
460 return;
461 }
462
463 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
464 twofish_dec_blk(ctx, srcdst, srcdst);
465}
466
467struct twofish_lrw_ctx {
468 struct lrw_table_ctx lrw_table;
469 struct twofish_ctx twofish_ctx;
470};
471
472static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
473 unsigned int keylen)
474{
475 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
476 int err;
477
478 err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
479 &tfm->crt_flags);
480 if (err)
481 return err;
482
483 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
484}
485
486static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
487 struct scatterlist *src, unsigned int nbytes)
488{
489 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
490 be128 buf[3];
491 struct lrw_crypt_req req = {
492 .tbuf = buf,
493 .tbuflen = sizeof(buf),
494
495 .table_ctx = &ctx->lrw_table,
496 .crypt_ctx = &ctx->twofish_ctx,
497 .crypt_fn = encrypt_callback,
498 };
499
500 return lrw_crypt(desc, dst, src, nbytes, &req);
501}
502
503static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
504 struct scatterlist *src, unsigned int nbytes)
505{
506 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
507 be128 buf[3];
508 struct lrw_crypt_req req = {
509 .tbuf = buf,
510 .tbuflen = sizeof(buf),
511
512 .table_ctx = &ctx->lrw_table,
513 .crypt_ctx = &ctx->twofish_ctx,
514 .crypt_fn = decrypt_callback,
515 };
516
517 return lrw_crypt(desc, dst, src, nbytes, &req);
518}
519
520static void lrw_exit_tfm(struct crypto_tfm *tfm)
521{
522 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
523
524 lrw_free_table(&ctx->lrw_table);
525}
526
527static struct crypto_alg blk_lrw_alg = {
528 .cra_name = "lrw(twofish)",
529 .cra_driver_name = "lrw-twofish-3way",
530 .cra_priority = 300,
531 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
532 .cra_blocksize = TF_BLOCK_SIZE,
533 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
534 .cra_alignmask = 0,
535 .cra_type = &crypto_blkcipher_type,
536 .cra_module = THIS_MODULE,
537 .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
538 .cra_exit = lrw_exit_tfm,
539 .cra_u = {
540 .blkcipher = {
541 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
542 .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
543 .ivsize = TF_BLOCK_SIZE,
544 .setkey = lrw_twofish_setkey,
545 .encrypt = lrw_encrypt,
546 .decrypt = lrw_decrypt,
547 },
548 },
549};
550
551struct twofish_xts_ctx {
552 struct twofish_ctx tweak_ctx;
553 struct twofish_ctx crypt_ctx;
554};
555
556static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
557 unsigned int keylen)
558{
559 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
560 u32 *flags = &tfm->crt_flags;
561 int err;
562
563 /* key consists of two keys of equal size concatenated, therefore
564 * the total length must be even
565 */
566 if (keylen % 2) {
567 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
568 return -EINVAL;
569 }
570
571 /* first half of xts-key is for crypt */
572 err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
573 if (err)
574 return err;
575
576 /* second half of xts-key is for tweak */
577 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
578 flags);
579}
580
581static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
582 struct scatterlist *src, unsigned int nbytes)
583{
584 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
585 be128 buf[3];
586 struct xts_crypt_req req = {
587 .tbuf = buf,
588 .tbuflen = sizeof(buf),
589
590 .tweak_ctx = &ctx->tweak_ctx,
591 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
592 .crypt_ctx = &ctx->crypt_ctx,
593 .crypt_fn = encrypt_callback,
594 };
595
596 return xts_crypt(desc, dst, src, nbytes, &req);
597}
598
599static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
600 struct scatterlist *src, unsigned int nbytes)
601{
602 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
603 be128 buf[3];
604 struct xts_crypt_req req = {
605 .tbuf = buf,
606 .tbuflen = sizeof(buf),
607
608 .tweak_ctx = &ctx->tweak_ctx,
609 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
610 .crypt_ctx = &ctx->crypt_ctx,
611 .crypt_fn = decrypt_callback,
612 };
613
614 return xts_crypt(desc, dst, src, nbytes, &req);
615}
616
617static struct crypto_alg blk_xts_alg = {
618 .cra_name = "xts(twofish)",
619 .cra_driver_name = "xts-twofish-3way",
620 .cra_priority = 300,
621 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
622 .cra_blocksize = TF_BLOCK_SIZE,
623 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
624 .cra_alignmask = 0,
625 .cra_type = &crypto_blkcipher_type,
626 .cra_module = THIS_MODULE,
627 .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
628 .cra_u = {
629 .blkcipher = {
630 .min_keysize = TF_MIN_KEY_SIZE * 2,
631 .max_keysize = TF_MAX_KEY_SIZE * 2,
632 .ivsize = TF_BLOCK_SIZE,
633 .setkey = xts_twofish_setkey,
634 .encrypt = xts_encrypt,
635 .decrypt = xts_decrypt,
636 },
637 },
638};
639
435int __init init(void) 640int __init init(void)
436{ 641{
437 int err; 642 int err;
@@ -445,9 +650,20 @@ int __init init(void)
445 err = crypto_register_alg(&blk_ctr_alg); 650 err = crypto_register_alg(&blk_ctr_alg);
446 if (err) 651 if (err)
447 goto ctr_err; 652 goto ctr_err;
653 err = crypto_register_alg(&blk_lrw_alg);
654 if (err)
655 goto blk_lrw_err;
656 err = crypto_register_alg(&blk_xts_alg);
657 if (err)
658 goto blk_xts_err;
448 659
449 return 0; 660 return 0;
450 661
662 crypto_unregister_alg(&blk_xts_alg);
663blk_xts_err:
664 crypto_unregister_alg(&blk_lrw_alg);
665blk_lrw_err:
666 crypto_unregister_alg(&blk_ctr_alg);
451ctr_err: 667ctr_err:
452 crypto_unregister_alg(&blk_cbc_alg); 668 crypto_unregister_alg(&blk_cbc_alg);
453cbc_err: 669cbc_err:
@@ -458,6 +674,8 @@ ecb_err:
458 674
459void __exit fini(void) 675void __exit fini(void)
460{ 676{
677 crypto_unregister_alg(&blk_xts_alg);
678 crypto_unregister_alg(&blk_lrw_alg);
461 crypto_unregister_alg(&blk_ctr_alg); 679 crypto_unregister_alg(&blk_ctr_alg);
462 crypto_unregister_alg(&blk_cbc_alg); 680 crypto_unregister_alg(&blk_cbc_alg);
463 crypto_unregister_alg(&blk_ecb_alg); 681 crypto_unregister_alg(&blk_ecb_alg);
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccfcf6ea..455646e0e532 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
6 7
7sysv-$(CONFIG_SYSVIPC) := ipc32.o 8sysv-$(CONFIG_SYSVIPC) := ipc32.o
8obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) 9obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3e274564f6bf..e3e734005e19 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
14#include <asm/segment.h> 14#include <asm/segment.h>
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17#include <linux/err.h>
17 18
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 19/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h> 20#include <linux/elf-em.h>
@@ -27,8 +28,6 @@
27 28
28 .section .entry.text, "ax" 29 .section .entry.text, "ax"
29 30
30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
31
32 .macro IA32_ARG_FIXUP noebp=0 31 .macro IA32_ARG_FIXUP noebp=0
33 movl %edi,%r8d 32 movl %edi,%r8d
34 .if \noebp 33 .if \noebp
@@ -191,7 +190,7 @@ sysexit_from_sys_call:
191 movl %ebx,%edx /* 3rd arg: 1st syscall arg */ 190 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
192 movl %eax,%esi /* 2nd arg: syscall number */ 191 movl %eax,%esi /* 2nd arg: syscall number */
193 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ 192 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
194 call audit_syscall_entry 193 call __audit_syscall_entry
195 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ 194 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
196 cmpq $(IA32_NR_syscalls-1),%rax 195 cmpq $(IA32_NR_syscalls-1),%rax
197 ja ia32_badsys 196 ja ia32_badsys
@@ -208,12 +207,13 @@ sysexit_from_sys_call:
208 TRACE_IRQS_ON 207 TRACE_IRQS_ON
209 sti 208 sti
210 movl %eax,%esi /* second arg, syscall return value */ 209 movl %eax,%esi /* second arg, syscall return value */
211 cmpl $0,%eax /* is it < 0? */ 210 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
212 setl %al /* 1 if so, 0 if not */ 211 jbe 1f
212 movslq %eax, %rsi /* if error sign extend to 64 bits */
2131: setbe %al /* 1 if error, 0 if not */
213 movzbl %al,%edi /* zero-extend that into %edi */ 214 movzbl %al,%edi /* zero-extend that into %edi */
214 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 215 call __audit_syscall_exit
215 call audit_syscall_exit 216 movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
216 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 cli 218 cli
219 TRACE_IRQS_OFF 219 TRACE_IRQS_OFF
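The rewritten comparison is the substantive change in this hunk: a syscall return value counts as an error only if it falls within the last MAX_ERRNO (4095) values of the unsigned range, because calls such as mmap() can legitimately return addresses with the sign bit set, which the old "is it < 0" test would have misreported to the audit code. The equivalent C, matching the kernel's IS_ERR_VALUE() convention:

	#include <stdbool.h>

	#define MAX_ERRNO 4095

	/* true only for returns in [-MAX_ERRNO, -1], i.e. real errnos */
	static bool syscall_failed(unsigned long ret)
	{
		return ret >= (unsigned long)-MAX_ERRNO;
	}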
@@ -447,9 +447,6 @@ ia32_badsys:
447 movq $-ENOSYS,%rax 447 movq $-ENOSYS,%rax
448 jmp ia32_sysret 448 jmp ia32_sysret
449 449
450quiet_ni_syscall:
451 movq $-ENOSYS,%rax
452 ret
453 CFI_ENDPROC 450 CFI_ENDPROC
454 451
455 .macro PTREGSCALL label, func, arg 452 .macro PTREGSCALL label, func, arg
@@ -494,357 +491,3 @@ ia32_ptregs_common:
494 jmp ia32_sysret /* misbalances the return cache */ 491 jmp ia32_sysret /* misbalances the return cache */
495 CFI_ENDPROC 492 CFI_ENDPROC
496END(ia32_ptregs_common) 493END(ia32_ptregs_common)
497
498 .section .rodata,"a"
499 .align 8
500ia32_sys_call_table:
501 .quad sys_restart_syscall
502 .quad sys_exit
503 .quad stub32_fork
504 .quad sys_read
505 .quad sys_write
506 .quad compat_sys_open /* 5 */
507 .quad sys_close
508 .quad sys32_waitpid
509 .quad sys_creat
510 .quad sys_link
511 .quad sys_unlink /* 10 */
512 .quad stub32_execve
513 .quad sys_chdir
514 .quad compat_sys_time
515 .quad sys_mknod
516 .quad sys_chmod /* 15 */
517 .quad sys_lchown16
518 .quad quiet_ni_syscall /* old break syscall holder */
519 .quad sys_stat
520 .quad sys32_lseek
521 .quad sys_getpid /* 20 */
522 .quad compat_sys_mount /* mount */
523 .quad sys_oldumount /* old_umount */
524 .quad sys_setuid16
525 .quad sys_getuid16
526 .quad compat_sys_stime /* stime */ /* 25 */
527 .quad compat_sys_ptrace /* ptrace */
528 .quad sys_alarm
529 .quad sys_fstat /* (old)fstat */
530 .quad sys_pause
531 .quad compat_sys_utime /* 30 */
532 .quad quiet_ni_syscall /* old stty syscall holder */
533 .quad quiet_ni_syscall /* old gtty syscall holder */
534 .quad sys_access
535 .quad sys_nice
536 .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
537 .quad sys_sync
538 .quad sys32_kill
539 .quad sys_rename
540 .quad sys_mkdir
541 .quad sys_rmdir /* 40 */
542 .quad sys_dup
543 .quad sys_pipe
544 .quad compat_sys_times
545 .quad quiet_ni_syscall /* old prof syscall holder */
546 .quad sys_brk /* 45 */
547 .quad sys_setgid16
548 .quad sys_getgid16
549 .quad sys_signal
550 .quad sys_geteuid16
551 .quad sys_getegid16 /* 50 */
552 .quad sys_acct
553 .quad sys_umount /* new_umount */
554 .quad quiet_ni_syscall /* old lock syscall holder */
555 .quad compat_sys_ioctl
556 .quad compat_sys_fcntl64 /* 55 */
557 .quad quiet_ni_syscall /* old mpx syscall holder */
558 .quad sys_setpgid
559 .quad quiet_ni_syscall /* old ulimit syscall holder */
560 .quad sys_olduname
561 .quad sys_umask /* 60 */
562 .quad sys_chroot
563 .quad compat_sys_ustat
564 .quad sys_dup2
565 .quad sys_getppid
566 .quad sys_getpgrp /* 65 */
567 .quad sys_setsid
568 .quad sys32_sigaction
569 .quad sys_sgetmask
570 .quad sys_ssetmask
571 .quad sys_setreuid16 /* 70 */
572 .quad sys_setregid16
573 .quad sys32_sigsuspend
574 .quad compat_sys_sigpending
575 .quad sys_sethostname
576 .quad compat_sys_setrlimit /* 75 */
577 .quad compat_sys_old_getrlimit /* old_getrlimit */
578 .quad compat_sys_getrusage
579 .quad compat_sys_gettimeofday
580 .quad compat_sys_settimeofday
581 .quad sys_getgroups16 /* 80 */
582 .quad sys_setgroups16
583 .quad compat_sys_old_select
584 .quad sys_symlink
585 .quad sys_lstat
586 .quad sys_readlink /* 85 */
587 .quad sys_uselib
588 .quad sys_swapon
589 .quad sys_reboot
590 .quad compat_sys_old_readdir
591 .quad sys32_mmap /* 90 */
592 .quad sys_munmap
593 .quad sys_truncate
594 .quad sys_ftruncate
595 .quad sys_fchmod
596 .quad sys_fchown16 /* 95 */
597 .quad sys_getpriority
598 .quad sys_setpriority
599 .quad quiet_ni_syscall /* old profil syscall holder */
600 .quad compat_sys_statfs
601 .quad compat_sys_fstatfs /* 100 */
602 .quad sys_ioperm
603 .quad compat_sys_socketcall
604 .quad sys_syslog
605 .quad compat_sys_setitimer
606 .quad compat_sys_getitimer /* 105 */
607 .quad compat_sys_newstat
608 .quad compat_sys_newlstat
609 .quad compat_sys_newfstat
610 .quad sys_uname
611 .quad stub32_iopl /* 110 */
612 .quad sys_vhangup
613 .quad quiet_ni_syscall /* old "idle" system call */
614 .quad sys32_vm86_warning /* vm86old */
615 .quad compat_sys_wait4
616 .quad sys_swapoff /* 115 */
617 .quad compat_sys_sysinfo
618 .quad sys32_ipc
619 .quad sys_fsync
620 .quad stub32_sigreturn
621 .quad stub32_clone /* 120 */
622 .quad sys_setdomainname
623 .quad sys_newuname
624 .quad sys_modify_ldt
625 .quad compat_sys_adjtimex
626 .quad sys32_mprotect /* 125 */
627 .quad compat_sys_sigprocmask
628 .quad quiet_ni_syscall /* create_module */
629 .quad sys_init_module
630 .quad sys_delete_module
631 .quad quiet_ni_syscall /* 130 get_kernel_syms */
632 .quad sys32_quotactl
633 .quad sys_getpgid
634 .quad sys_fchdir
635 .quad quiet_ni_syscall /* bdflush */
636 .quad sys_sysfs /* 135 */
637 .quad sys_personality
638 .quad quiet_ni_syscall /* for afs_syscall */
639 .quad sys_setfsuid16
640 .quad sys_setfsgid16
641 .quad sys_llseek /* 140 */
642 .quad compat_sys_getdents
643 .quad compat_sys_select
644 .quad sys_flock
645 .quad sys_msync
646 .quad compat_sys_readv /* 145 */
647 .quad compat_sys_writev
648 .quad sys_getsid
649 .quad sys_fdatasync
650 .quad compat_sys_sysctl /* sysctl */
651 .quad sys_mlock /* 150 */
652 .quad sys_munlock
653 .quad sys_mlockall
654 .quad sys_munlockall
655 .quad sys_sched_setparam
656 .quad sys_sched_getparam /* 155 */
657 .quad sys_sched_setscheduler
658 .quad sys_sched_getscheduler
659 .quad sys_sched_yield
660 .quad sys_sched_get_priority_max
661 .quad sys_sched_get_priority_min /* 160 */
662 .quad sys32_sched_rr_get_interval
663 .quad compat_sys_nanosleep
664 .quad sys_mremap
665 .quad sys_setresuid16
666 .quad sys_getresuid16 /* 165 */
667 .quad sys32_vm86_warning /* vm86 */
668 .quad quiet_ni_syscall /* query_module */
669 .quad sys_poll
670 .quad quiet_ni_syscall /* old nfsservctl */
671 .quad sys_setresgid16 /* 170 */
672 .quad sys_getresgid16
673 .quad sys_prctl
674 .quad stub32_rt_sigreturn
675 .quad sys32_rt_sigaction
676 .quad sys32_rt_sigprocmask /* 175 */
677 .quad sys32_rt_sigpending
678 .quad compat_sys_rt_sigtimedwait
679 .quad sys32_rt_sigqueueinfo
680 .quad sys_rt_sigsuspend
681 .quad sys32_pread /* 180 */
682 .quad sys32_pwrite
683 .quad sys_chown16
684 .quad sys_getcwd
685 .quad sys_capget
686 .quad sys_capset
687 .quad stub32_sigaltstack
688 .quad sys32_sendfile
689 .quad quiet_ni_syscall /* streams1 */
690 .quad quiet_ni_syscall /* streams2 */
691 .quad stub32_vfork /* 190 */
692 .quad compat_sys_getrlimit
693 .quad sys_mmap_pgoff
694 .quad sys32_truncate64
695 .quad sys32_ftruncate64
696 .quad sys32_stat64 /* 195 */
697 .quad sys32_lstat64
698 .quad sys32_fstat64
699 .quad sys_lchown
700 .quad sys_getuid
701 .quad sys_getgid /* 200 */
702 .quad sys_geteuid
703 .quad sys_getegid
704 .quad sys_setreuid
705 .quad sys_setregid
706 .quad sys_getgroups /* 205 */
707 .quad sys_setgroups
708 .quad sys_fchown
709 .quad sys_setresuid
710 .quad sys_getresuid
711 .quad sys_setresgid /* 210 */
712 .quad sys_getresgid
713 .quad sys_chown
714 .quad sys_setuid
715 .quad sys_setgid
716 .quad sys_setfsuid /* 215 */
717 .quad sys_setfsgid
718 .quad sys_pivot_root
719 .quad sys_mincore
720 .quad sys_madvise
721 .quad compat_sys_getdents64 /* 220 getdents64 */
722 .quad compat_sys_fcntl64
723 .quad quiet_ni_syscall /* tux */
724 .quad quiet_ni_syscall /* security */
725 .quad sys_gettid
726 .quad sys32_readahead /* 225 */
727 .quad sys_setxattr
728 .quad sys_lsetxattr
729 .quad sys_fsetxattr
730 .quad sys_getxattr
731 .quad sys_lgetxattr /* 230 */
732 .quad sys_fgetxattr
733 .quad sys_listxattr
734 .quad sys_llistxattr
735 .quad sys_flistxattr
736 .quad sys_removexattr /* 235 */
737 .quad sys_lremovexattr
738 .quad sys_fremovexattr
739 .quad sys_tkill
740 .quad sys_sendfile64
741 .quad compat_sys_futex /* 240 */
742 .quad compat_sys_sched_setaffinity
743 .quad compat_sys_sched_getaffinity
744 .quad sys_set_thread_area
745 .quad sys_get_thread_area
746 .quad compat_sys_io_setup /* 245 */
747 .quad sys_io_destroy
748 .quad compat_sys_io_getevents
749 .quad compat_sys_io_submit
750 .quad sys_io_cancel
751 .quad sys32_fadvise64 /* 250 */
752 .quad quiet_ni_syscall /* free_huge_pages */
753 .quad sys_exit_group
754 .quad sys32_lookup_dcookie
755 .quad sys_epoll_create
756 .quad sys_epoll_ctl /* 255 */
757 .quad sys_epoll_wait
758 .quad sys_remap_file_pages
759 .quad sys_set_tid_address
760 .quad compat_sys_timer_create
761 .quad compat_sys_timer_settime /* 260 */
762 .quad compat_sys_timer_gettime
763 .quad sys_timer_getoverrun
764 .quad sys_timer_delete
765 .quad compat_sys_clock_settime
766 .quad compat_sys_clock_gettime /* 265 */
767 .quad compat_sys_clock_getres
768 .quad compat_sys_clock_nanosleep
769 .quad compat_sys_statfs64
770 .quad compat_sys_fstatfs64
771 .quad sys_tgkill /* 270 */
772 .quad compat_sys_utimes
773 .quad sys32_fadvise64_64
774 .quad quiet_ni_syscall /* sys_vserver */
775 .quad sys_mbind
776 .quad compat_sys_get_mempolicy /* 275 */
777 .quad sys_set_mempolicy
778 .quad compat_sys_mq_open
779 .quad sys_mq_unlink
780 .quad compat_sys_mq_timedsend
781 .quad compat_sys_mq_timedreceive /* 280 */
782 .quad compat_sys_mq_notify
783 .quad compat_sys_mq_getsetattr
784 .quad compat_sys_kexec_load /* reserved for kexec */
785 .quad compat_sys_waitid
786 .quad quiet_ni_syscall /* 285: sys_altroot */
787 .quad sys_add_key
788 .quad sys_request_key
789 .quad sys_keyctl
790 .quad sys_ioprio_set
791 .quad sys_ioprio_get /* 290 */
792 .quad sys_inotify_init
793 .quad sys_inotify_add_watch
794 .quad sys_inotify_rm_watch
795 .quad sys_migrate_pages
796 .quad compat_sys_openat /* 295 */
797 .quad sys_mkdirat
798 .quad sys_mknodat
799 .quad sys_fchownat
800 .quad compat_sys_futimesat
801 .quad sys32_fstatat /* 300 */
802 .quad sys_unlinkat
803 .quad sys_renameat
804 .quad sys_linkat
805 .quad sys_symlinkat
806 .quad sys_readlinkat /* 305 */
807 .quad sys_fchmodat
808 .quad sys_faccessat
809 .quad compat_sys_pselect6
810 .quad compat_sys_ppoll
811 .quad sys_unshare /* 310 */
812 .quad compat_sys_set_robust_list
813 .quad compat_sys_get_robust_list
814 .quad sys_splice
815 .quad sys32_sync_file_range
816 .quad sys_tee /* 315 */
817 .quad compat_sys_vmsplice
818 .quad compat_sys_move_pages
819 .quad sys_getcpu
820 .quad sys_epoll_pwait
821 .quad compat_sys_utimensat /* 320 */
822 .quad compat_sys_signalfd
823 .quad sys_timerfd_create
824 .quad sys_eventfd
825 .quad sys32_fallocate
826 .quad compat_sys_timerfd_settime /* 325 */
827 .quad compat_sys_timerfd_gettime
828 .quad compat_sys_signalfd4
829 .quad sys_eventfd2
830 .quad sys_epoll_create1
831 .quad sys_dup3 /* 330 */
832 .quad sys_pipe2
833 .quad sys_inotify_init1
834 .quad compat_sys_preadv
835 .quad compat_sys_pwritev
836 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
837 .quad sys_perf_event_open
838 .quad compat_sys_recvmmsg
839 .quad sys_fanotify_init
840 .quad sys32_fanotify_mark
841 .quad sys_prlimit64 /* 340 */
842 .quad sys_name_to_handle_at
843 .quad compat_sys_open_by_handle_at
844 .quad compat_sys_clock_adjtime
845 .quad sys_syncfs
846 .quad compat_sys_sendmmsg /* 345 */
847 .quad sys_setns
848 .quad compat_sys_process_vm_readv
849 .quad compat_sys_process_vm_writev
850ia32_syscall_end:
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
new file mode 100644
index 000000000000..51ecd5b4e787
--- /dev/null
+++ b/arch/x86/ia32/nosyscall.c
@@ -0,0 +1,7 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3
4long compat_ni_syscall(void)
5{
6 return -ENOSYS;
7}
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
new file mode 100644
index 000000000000..4754ba0f5d9f
--- /dev/null
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -0,0 +1,25 @@
1/* System call table for ia32 emulation. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
13
14typedef void (*sys_call_ptr_t)(void);
15
16extern void compat_ni_syscall(void);
17
18const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
24#include <asm/syscalls_32.h>
25};
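The new table replaces some 350 hand-maintained .quad entries (removed from ia32entry.S above) with a generated header: including <asm/syscalls_32.h> once under each definition of __SYSCALL_I386 first emits the declarations and then the designated initializers, while the GNU range initializer pre-fills every slot with compat_ni_syscall so numbering holes stay safe. A self-contained demonstration of the same trick (toy three-entry list standing in for the generated header; range initializers are a GNU extension, as in the kernel):

	#include <stdio.h>

	/* stand-in for the generated <asm/syscalls_32.h> */
	#define SYSCALL_LIST(X)			\
		X(0, sys_restart, sys_restart)	\
		X(1, sys_exit,    sys_exit)	\
		X(3, sys_read,    compat_read)

	#define __SYSCALL_I386(nr, sym, compat) \
		static void compat(void) { puts(#compat); }
	SYSCALL_LIST(__SYSCALL_I386)
	#undef __SYSCALL_I386

	static void compat_ni_syscall(void) { puts("-ENOSYS"); }

	typedef void (*sys_call_ptr_t)(void);

	#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
	static const sys_call_ptr_t table[4] = {
		[0 ... 3] = compat_ni_syscall,	/* default every slot */
		SYSCALL_LIST(__SYSCALL_I386)	/* real entries override */
	};
	#undef __SYSCALL_I386

	int main(void)
	{
		table[2]();	/* hole -> -ENOSYS */
		table[3]();	/* compat_read */
		return 0;
	}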
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 6fa90a845e4c..b57e6a43a37a 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -19,7 +19,8 @@ header-y += processor-flags.h
19header-y += ptrace-abi.h 19header-y += ptrace-abi.h
20header-y += sigcontext32.h 20header-y += sigcontext32.h
21header-y += ucontext.h 21header-y += ucontext.h
22header-y += unistd_32.h
23header-y += unistd_64.h
24header-y += vm86.h 22header-y += vm86.h
25header-y += vsyscall.h 23header-y += vsyscall.h
24
25genhdr-y += unistd_32.h
26genhdr-y += unistd_64.h
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071704a5..49ad773f4b9f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_AMD_NB_H 1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_AMD_NB_H 2#define _ASM_X86_AMD_NB_H
3 3
4#include <linux/ioport.h>
4#include <linux/pci.h> 5#include <linux/pci.h>
5 6
6struct amd_nb_bus_dev_range { 7struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 14extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14 15
15extern bool early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern struct resource *amd_get_mmconfig_range(struct resource *res);
16extern int amd_cache_northbridges(void); 18extern int amd_cache_northbridges(void);
17extern void amd_flush_garts(void); 19extern void amd_flush_garts(void);
18extern int amd_numa_init(void); 20extern int amd_numa_init(void);
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88ec02d..2f90c51cc49d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
64 __u32 payload_offset; 64 __u32 payload_offset;
65 __u32 payload_length; 65 __u32 payload_length;
66 __u64 setup_data; 66 __u64 setup_data;
67 __u64 pref_address;
68 __u32 init_size;
67} __attribute__((packed)); 69} __attribute__((packed));
68 70
69struct sys_desc_table { 71struct sys_desc_table {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f700f36..17c5d4bdee5e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
197 197
198/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 198/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
199#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 199#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
200#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
201#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
200#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ 202#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
203#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
201#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 204#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
202 205
203#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 206#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc6..b903d5ea3941 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
101 101
102extern void hw_breakpoint_restore(void); 102extern void hw_breakpoint_restore(void);
103 103
104#ifdef CONFIG_X86_64
105DECLARE_PER_CPU(int, debug_stack_usage);
106static inline void debug_stack_usage_inc(void)
107{
108 __get_cpu_var(debug_stack_usage)++;
109}
110static inline void debug_stack_usage_dec(void)
111{
112 __get_cpu_var(debug_stack_usage)--;
113}
114int is_debug_stack(unsigned long addr);
115void debug_stack_set_zero(void);
116void debug_stack_reset(void);
117#else /* !X86_64 */
118static inline int is_debug_stack(unsigned long addr) { return 0; }
119static inline void debug_stack_set_zero(void) { }
120static inline void debug_stack_reset(void) { }
121static inline void debug_stack_usage_inc(void) { }
122static inline void debug_stack_usage_dec(void) { }
123#endif /* X86_64 */
124
125
104#endif /* __KERNEL__ */ 126#endif /* __KERNEL__ */
105 127
106#endif /* _ASM_X86_DEBUGREG_H */ 128#endif /* _ASM_X86_DEBUGREG_H */
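The usage-counter half of this interface pairs with the NMI rework elsewhere in this merge (see the nmi_idt_table additions to desc.h below): code that may be running on the IST debug stack brackets itself so the NMI path can tell the stack is live and avoid corrupting it. The presumed call pattern, inferred from the interface rather than copied from the call sites:

	debug_stack_usage_inc();
	/* ... handle a breakpoint/#DB that may sit on the debug stack ... */
	debug_stack_usage_dec();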
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdfc..e95822d683f4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
35 35
36extern struct desc_ptr idt_descr; 36extern struct desc_ptr idt_descr;
37extern gate_desc idt_table[]; 37extern gate_desc idt_table[];
38extern struct desc_ptr nmi_idt_descr;
39extern gate_desc nmi_idt_table[];
38 40
39struct gdt_page { 41struct gdt_page {
40 struct desc_struct gdt[GDT_ENTRIES]; 42 struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
307 desc->limit = (limit >> 16) & 0xf; 309 desc->limit = (limit >> 16) & 0xf;
308} 310}
309 311
312#ifdef CONFIG_X86_64
313static inline void set_nmi_gate(int gate, void *addr)
314{
315 gate_desc s;
316
317 pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
318 write_idt_entry(nmi_idt_table, gate, &s);
319}
320#endif
321
310static inline void _set_gate(int gate, unsigned type, void *addr, 322static inline void _set_gate(int gate, unsigned type, void *addr,
311 unsigned dpl, unsigned ist, unsigned seg) 323 unsigned dpl, unsigned ist, unsigned seg)
312{ 324{
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 7093e4a6a0bc..844f735fd63a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
3 3
4#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
5 5
6#define EFI_LOADER_SIGNATURE "EL32"
7
6extern unsigned long asmlinkage efi_call_phys(void *, ...); 8extern unsigned long asmlinkage efi_call_phys(void *, ...);
7 9
8#define efi_call_phys0(f) efi_call_phys(f) 10#define efi_call_phys0(f) efi_call_phys(f)
@@ -37,6 +39,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
37 39
38#else /* !CONFIG_X86_32 */ 40#else /* !CONFIG_X86_32 */
39 41
42#define EFI_LOADER_SIGNATURE "EL64"
43
40extern u64 efi_call0(void *fp); 44extern u64 efi_call0(void *fp);
41extern u64 efi_call1(void *fp, u64 arg1); 45extern u64 efi_call1(void *fp, u64 arg1);
42extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 46extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e4852c..4da3c0c4c974 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
117#endif 117#endif
118 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ 118 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
119 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ 119 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
120#ifdef CONFIG_X86_MRST 120#ifdef CONFIG_X86_INTEL_MID
121 FIX_LNW_VRTC, 121 FIX_LNW_VRTC,
122#endif 122#endif
123 __end_of_permanent_fixed_addresses, 123 __end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ecd2ce6..b0d5716ca1e4 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -2,17 +2,10 @@
2#define _ASM_X86_IA32_UNISTD_H 2#define _ASM_X86_IA32_UNISTD_H
3 3
4/* 4/*
5 * This file contains the system call numbers of the ia32 port, 5 * This file contains the system call numbers of the ia32 compat ABI,
6 * this is for the kernel only. 6 * this is for the kernel only.
7 * Only add syscalls here where some part of the kernel needs to know
8 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
9 */ 7 */
10 8#define __SYSCALL_ia32_NR(x) (x)
11#define __NR_ia32_restart_syscall 0 9#include <asm/unistd_32_ia32.h>
12#define __NR_ia32_exit 1
13#define __NR_ia32_read 3
14#define __NR_ia32_write 4
15#define __NR_ia32_sigreturn 119
16#define __NR_ia32_rt_sigreturn 173
17 10
18#endif /* _ASM_X86_IA32_UNISTD_H */ 11#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353e41e1..adcc0ae73d09 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
5extern void __init early_ioremap_page_table_range_init(void); 5extern void __init early_ioremap_page_table_range_init(void);
6#endif 6#endif
7 7
8extern void __init zone_sizes_init(void);
9
8extern unsigned long __init 10extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start, 11kernel_physical_mapping_init(unsigned long start,
10 unsigned long end, 12 unsigned long end,
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef152..dffc38ee6255 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
5extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
6extern int iommu_detected; 6extern int iommu_detected;
7extern int iommu_pass_through; 7extern int iommu_pass_through;
8extern int iommu_group_mf;
8 9
9/* 10 seconds */ 10/* 10 seconds */
10#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507893e9..ab4092e3214e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); 181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); 182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); 183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
184 int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
184 void (*halt)(struct x86_emulate_ctxt *ctxt); 185 void (*halt)(struct x86_emulate_ctxt *ctxt);
185 void (*wbinvd)(struct x86_emulate_ctxt *ctxt); 186 void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
186 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); 187 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
@@ -364,6 +365,7 @@ enum x86_intercept {
364#endif 365#endif
365 366
366int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); 367int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
368bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
367#define EMULATION_FAILED -1 369#define EMULATION_FAILED -1
368#define EMULATION_OK 0 370#define EMULATION_OK 0
369#define EMULATION_RESTART 1 371#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4dab98..52d6640a5ca1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h> 18#include <linux/cpumask.h>
19#include <linux/irq_work.h>
19 20
20#include <linux/kvm.h> 21#include <linux/kvm.h>
21#include <linux/kvm_para.h> 22#include <linux/kvm_para.h>
22#include <linux/kvm_types.h> 23#include <linux/kvm_types.h>
24#include <linux/perf_event.h>
23 25
24#include <asm/pvclock-abi.h> 26#include <asm/pvclock-abi.h>
25#include <asm/desc.h> 27#include <asm/desc.h>
@@ -31,6 +33,8 @@
31#define KVM_MEMORY_SLOTS 32 33#define KVM_MEMORY_SLOTS 32
32/* memory slots that are not exposed to userspace */ 34/* memory slots that are not exposed to userspace */
33#define KVM_PRIVATE_MEM_SLOTS 4 35#define KVM_PRIVATE_MEM_SLOTS 4
36#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
37
34#define KVM_MMIO_SIZE 16 38#define KVM_MMIO_SIZE 16
35 39
36#define KVM_PIO_PAGE_OFFSET 1 40#define KVM_PIO_PAGE_OFFSET 1
@@ -228,7 +232,7 @@ struct kvm_mmu_page {
228 * One bit set per slot which has memory 232 * One bit set per slot which has memory
229 * in this shadow page. 233 * in this shadow page.
230 */ 234 */
231 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 235 DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
232 bool unsync; 236 bool unsync;
233 int root_count; /* Currently serving as active root */ 237 int root_count; /* Currently serving as active root */
234 unsigned int unsync_children; 238 unsigned int unsync_children;
@@ -239,14 +243,9 @@ struct kvm_mmu_page {
239 int clear_spte_count; 243 int clear_spte_count;
240#endif 244#endif
241 245
242 struct rcu_head rcu; 246 int write_flooding_count;
243};
244 247
245struct kvm_pv_mmu_op_buffer { 248 struct rcu_head rcu;
246 void *ptr;
247 unsigned len;
248 unsigned processed;
249 char buf[512] __aligned(sizeof(long));
250}; 249};
251 250
252struct kvm_pio_request { 251struct kvm_pio_request {
@@ -294,6 +293,37 @@ struct kvm_mmu {
294 u64 pdptrs[4]; /* pae */ 293 u64 pdptrs[4]; /* pae */
295}; 294};
296 295
296enum pmc_type {
297 KVM_PMC_GP = 0,
298 KVM_PMC_FIXED,
299};
300
301struct kvm_pmc {
302 enum pmc_type type;
303 u8 idx;
304 u64 counter;
305 u64 eventsel;
306 struct perf_event *perf_event;
307 struct kvm_vcpu *vcpu;
308};
309
310struct kvm_pmu {
311 unsigned nr_arch_gp_counters;
312 unsigned nr_arch_fixed_counters;
313 unsigned available_event_types;
314 u64 fixed_ctr_ctrl;
315 u64 global_ctrl;
316 u64 global_status;
317 u64 global_ovf_ctrl;
318 u64 counter_bitmask[2];
319 u64 global_ctrl_mask;
320 u8 version;
321 struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
322 struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
323 struct irq_work irq_work;
324 u64 reprogram_pmi;
325};
326
297struct kvm_vcpu_arch { 327struct kvm_vcpu_arch {
298 /* 328 /*
299 * rip and regs accesses must go through 329 * rip and regs accesses must go through
@@ -345,19 +375,10 @@ struct kvm_vcpu_arch {
345 */ 375 */
346 struct kvm_mmu *walk_mmu; 376 struct kvm_mmu *walk_mmu;
347 377
348 /* only needed in kvm_pv_mmu_op() path, but it's hot so
349 * put it here to avoid allocation */
350 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
351
352 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; 378 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
353 struct kvm_mmu_memory_cache mmu_page_cache; 379 struct kvm_mmu_memory_cache mmu_page_cache;
354 struct kvm_mmu_memory_cache mmu_page_header_cache; 380 struct kvm_mmu_memory_cache mmu_page_header_cache;
355 381
356 gfn_t last_pt_write_gfn;
357 int last_pt_write_count;
358 u64 *last_pte_updated;
359 gfn_t last_pte_gfn;
360
361 struct fpu guest_fpu; 382 struct fpu guest_fpu;
362 u64 xcr0; 383 u64 xcr0;
363 384
@@ -436,6 +457,8 @@ struct kvm_vcpu_arch {
436 unsigned access; 457 unsigned access;
437 gfn_t mmio_gfn; 458 gfn_t mmio_gfn;
438 459
460 struct kvm_pmu pmu;
461
439 /* used for guest single stepping over the given code position */ 462 /* used for guest single stepping over the given code position */
440 unsigned long singlestep_rip; 463 unsigned long singlestep_rip;
441 464
@@ -444,6 +467,9 @@ struct kvm_vcpu_arch {
444 467
445 cpumask_var_t wbinvd_dirty_mask; 468 cpumask_var_t wbinvd_dirty_mask;
446 469
470 unsigned long last_retry_eip;
471 unsigned long last_retry_addr;
472
447 struct { 473 struct {
448 bool halted; 474 bool halted;
449 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; 475 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
@@ -459,7 +485,6 @@ struct kvm_arch {
459 unsigned int n_requested_mmu_pages; 485 unsigned int n_requested_mmu_pages;
460 unsigned int n_max_mmu_pages; 486 unsigned int n_max_mmu_pages;
461 unsigned int indirect_shadow_pages; 487 unsigned int indirect_shadow_pages;
462 atomic_t invlpg_counter;
463 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 488 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
464 /* 489 /*
465 * Hash table of struct kvm_mmu_page. 490 * Hash table of struct kvm_mmu_page.
@@ -660,6 +685,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
660 685
661int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 686int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
662void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 687void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
688int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
689 struct kvm_memory_slot *slot);
663void kvm_mmu_zap_all(struct kvm *kvm); 690void kvm_mmu_zap_all(struct kvm *kvm);
664unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 691unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
665void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 692void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
@@ -668,8 +695,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
668 695
669int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 696int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
670 const void *val, int bytes); 697 const void *val, int bytes);
671int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
672 gpa_t addr, unsigned long *ret);
673u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); 698u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
674 699
675extern bool tdp_enabled; 700extern bool tdp_enabled;
@@ -692,6 +717,7 @@ enum emulation_result {
692#define EMULTYPE_NO_DECODE (1 << 0) 717#define EMULTYPE_NO_DECODE (1 << 0)
693#define EMULTYPE_TRAP_UD (1 << 1) 718#define EMULTYPE_TRAP_UD (1 << 1)
694#define EMULTYPE_SKIP (1 << 2) 719#define EMULTYPE_SKIP (1 << 2)
720#define EMULTYPE_RETRY (1 << 3)
695int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, 721int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
696 int emulation_type, void *insn, int insn_len); 722 int emulation_type, void *insn, int insn_len);
697 723
@@ -734,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
734 760
735unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 761unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
736void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 762void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
763bool kvm_rdpmc(struct kvm_vcpu *vcpu);
737 764
738void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 765void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
739void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 766void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -754,13 +781,14 @@ int fx_init(struct kvm_vcpu *vcpu);
754 781
755void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 782void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
756void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 783void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
757 const u8 *new, int bytes, 784 const u8 *new, int bytes);
758 bool guest_initiated); 785int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
759int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 786int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
760void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 787void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
761int kvm_mmu_load(struct kvm_vcpu *vcpu); 788int kvm_mmu_load(struct kvm_vcpu *vcpu);
762void kvm_mmu_unload(struct kvm_vcpu *vcpu); 789void kvm_mmu_unload(struct kvm_vcpu *vcpu);
763void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 790void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
791gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
764gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 792gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
765 struct x86_exception *exception); 793 struct x86_exception *exception);
766gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 794gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -782,6 +810,11 @@ void kvm_disable_tdp(void);
782int complete_pio(struct kvm_vcpu *vcpu); 810int complete_pio(struct kvm_vcpu *vcpu);
783bool kvm_check_iopl(struct kvm_vcpu *vcpu); 811bool kvm_check_iopl(struct kvm_vcpu *vcpu);
784 812
813static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
814{
815 return gpa;
816}
817
785static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 818static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
786{ 819{
787 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 820 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -894,4 +927,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
894 927
895void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); 928void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
896 929
930int kvm_is_in_guest(void);
931
932void kvm_pmu_init(struct kvm_vcpu *vcpu);
933void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
934void kvm_pmu_reset(struct kvm_vcpu *vcpu);
935void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
936bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
937int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
938int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
939int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
940void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
941void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
942
897#endif /* _ASM_X86_KVM_HOST_H */ 943#endif /* _ASM_X86_KVM_HOST_H */
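
The kvm_host.h hunks above grow the host API in three directions: rmap-based write protection for dirty logging (kvm_mmu_rmap_write_protect), an instruction-retry path (EMULTYPE_RETRY plus the slimmed-down kvm_mmu_pte_write() signature and new kvm_mmu_unprotect_page()), and a block of kvm_pmu_* hooks for performance-counter virtualization, with kvm_rdpmc() as the guest-visible entry point. A rough sketch of how the PMU hooks are meant to compose on the MSR read path; the wrapper function is hypothetical, only the kvm_pmu_* and kvm_get_msr_common names come from the header itself:

/* Hypothetical MSR read dispatch built from the declarations above. */
static int get_msr_sketch(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	if (kvm_pmu_msr(vcpu, msr))		/* does the PMU claim this MSR? */
		return kvm_pmu_get_msr(vcpu, msr, data);
	return kvm_get_msr_common(vcpu, msr, data);	/* generic path */
}
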
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6add827381c9..6aefb14cbbc5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
151 151
152void mce_setup(struct mce *m); 152void mce_setup(struct mce *m);
153void mce_log(struct mce *m); 153void mce_log(struct mce *m);
154DECLARE_PER_CPU(struct sys_device, mce_sysdev); 154extern struct device *mce_device[CONFIG_NR_CPUS];
155 155
156/* 156/*
157 * Maximum banks number. 157 * Maximum banks number.
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 93f79094c224..0a0a95460434 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -67,7 +67,7 @@ extern struct console early_mrst_console;
67extern void mrst_early_console_init(void); 67extern void mrst_early_console_init(void);
68 68
69extern struct console early_hsu_console; 69extern struct console early_hsu_console;
70extern void hsu_early_console_init(void); 70extern void hsu_early_console_init(const char *);
71 71
72extern void intel_scu_devices_create(void); 72extern void intel_scu_devices_create(void);
73extern void intel_scu_devices_destroy(void); 73extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943b906c..df75d07571ce 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
112{ 112{
113 x86_msi.teardown_msi_irq(irq); 113 x86_msi.teardown_msi_irq(irq);
114} 114}
115static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
116{
117 x86_msi.restore_msi_irqs(dev, irq);
118}
115#define arch_setup_msi_irqs x86_setup_msi_irqs 119#define arch_setup_msi_irqs x86_setup_msi_irqs
116#define arch_teardown_msi_irqs x86_teardown_msi_irqs 120#define arch_teardown_msi_irqs x86_teardown_msi_irqs
117#define arch_teardown_msi_irq x86_teardown_msi_irq 121#define arch_teardown_msi_irq x86_teardown_msi_irq
122#define arch_restore_msi_irqs x86_restore_msi_irqs
118/* implemented in arch/x86/kernel/apic/io_apic. */ 123/* implemented in arch/x86/kernel/apic/io_apic. */
119int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 124int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
120void native_teardown_msi_irq(unsigned int irq); 125void native_teardown_msi_irq(unsigned int irq);
126void native_restore_msi_irqs(struct pci_dev *dev, int irq);
121/* default to the implementation in drivers/lib/msi.c */ 127/* default to the implementation in drivers/lib/msi.c */
122#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS 128#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
129#define HAVE_DEFAULT_MSI_RESTORE_IRQS
123void default_teardown_msi_irqs(struct pci_dev *dev); 130void default_teardown_msi_irqs(struct pci_dev *dev);
131void default_restore_msi_irqs(struct pci_dev *dev, int irq);
124#else 132#else
125#define native_setup_msi_irqs NULL 133#define native_setup_msi_irqs NULL
126#define native_teardown_msi_irq NULL 134#define native_teardown_msi_irq NULL
127#define default_teardown_msi_irqs NULL 135#define default_teardown_msi_irqs NULL
136#define default_restore_msi_irqs NULL
128#endif 137#endif
129 138
130#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 139#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
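
The restore_msi_irqs addition follows the existing MSI setup/teardown pattern exactly: pci.h supplies a one-line wrapper that indirects through the x86_msi ops table (the matching struct x86_msi_ops member appears in the x86_init.h hunk further below), so a platform such as Xen can override the native implementation while everyone else falls through to the default. A reduced, self-contained model of the indirection, with the kernel types stubbed out:

/* Reduced model of the x86_msi indirection; types are stand-ins. */
struct pci_dev;

struct x86_msi_ops {
	void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
};

static void default_restore(struct pci_dev *dev, int irq)
{
	/* generic re-programming of the device's MSI registers */
}

static struct x86_msi_ops x86_msi = { .restore_msi_irqs = default_restore };

static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
{
	x86_msi.restore_msi_irqs(dev, irq);	/* callers use only this */
}
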
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e38197806853..b3a531746026 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -44,8 +44,6 @@ enum pci_bf_sort_state {
44 44
45/* pci-i386.c */ 45/* pci-i386.c */
46 46
47extern unsigned int pcibios_max_latency;
48
49void pcibios_resource_survey(void); 47void pcibios_resource_survey(void);
50void pcibios_set_cache_line_size(void); 48void pcibios_set_cache_line_size(void);
51 49
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 529bf07e8067..7a11910a63c4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,22 +414,6 @@ do { \
414#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 414#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 415#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
416 416
417#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
418#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
419#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
420#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
421#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
422#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
423#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
424#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
425#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
426#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
427#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
428#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
429#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
430#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
431#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
432
433#ifndef CONFIG_M386 417#ifndef CONFIG_M386
434#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 418#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
435#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) 419#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
@@ -445,9 +429,6 @@ do { \
445#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 429#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
446#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 430#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
447 431
448#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
449#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
450#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
451#endif /* !CONFIG_M386 */ 432#endif /* !CONFIG_M386 */
452 433
453#ifdef CONFIG_X86_CMPXCHG64 434#ifdef CONFIG_X86_CMPXCHG64
@@ -464,7 +445,6 @@ do { \
464 445
465#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double 446#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
466#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double 447#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
467#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
468#endif /* CONFIG_X86_CMPXCHG64 */ 448#endif /* CONFIG_X86_CMPXCHG64 */
469 449
470/* 450/*
@@ -492,13 +472,6 @@ do { \
492#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 472#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
493#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 473#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
494 474
495#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
496#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
497#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
498#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
499#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
500#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
501
502/* 475/*
503 * Pretty complex macro to generate cmpxchg16 instruction. The instruction 476 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
504 * is not supported on early AMD64 processors so we must be able to emulate 477 * is not supported on early AMD64 processors so we must be able to emulate
@@ -521,7 +494,6 @@ do { \
521 494
522#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double 495#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
523#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double 496#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
524#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
525 497
526#endif 498#endif
527 499
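
All four percpu.h hunks delete the irqsafe_cpu_* family. As the removed lines themselves show, these were defined to exactly the same percpu_add_op()/percpu_to_op()/percpu_xchg_op()/percpu_cmpxchg_op() expansions as the corresponding this_cpu_* macros: a single segment-prefixed read-modify-write instruction is already atomic with respect to interrupts on the local CPU, so the irqsafe variants added nothing. Callers convert by plain renaming:

	/* old, removed by this patch */	/* new, identical code on x86 */
	irqsafe_cpu_add(counter, 1);		this_cpu_add(counter, 1);
	irqsafe_cpu_or(flags, mask);		this_cpu_or(flags, mask);
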
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..d3ef63fe0c81
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,63 @@
1#ifndef ASM_X86_SERPENT_H
2#define ASM_X86_SERPENT_H
3
4#include <linux/crypto.h>
5#include <crypto/serpent.h>
6
7#ifdef CONFIG_X86_32
8
9#define SERPENT_PARALLEL_BLOCKS 4
10
11asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src, bool xor);
13asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
14 const u8 *src);
15
16static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
17 const u8 *src)
18{
19 __serpent_enc_blk_4way(ctx, dst, src, false);
20}
21
22static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
23 const u8 *src)
24{
25 __serpent_enc_blk_4way(ctx, dst, src, true);
26}
27
28static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
29 const u8 *src)
30{
31 serpent_dec_blk_4way(ctx, dst, src);
32}
33
34#else
35
36#define SERPENT_PARALLEL_BLOCKS 8
37
38asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
39 const u8 *src, bool xor);
40asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
41 const u8 *src);
42
43static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
44 const u8 *src)
45{
46 __serpent_enc_blk_8way(ctx, dst, src, false);
47}
48
49static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
50 const u8 *src)
51{
52 __serpent_enc_blk_8way(ctx, dst, src, true);
53}
54
55static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
56 const u8 *src)
57{
58 serpent_dec_blk_8way(ctx, dst, src);
59}
60
61#endif
62
63#endif
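
The new serpent.h deliberately exposes one interface for both configurations: serpent_enc_blk_xway(), serpent_enc_blk_xway_xor() and serpent_dec_blk_xway(), with only SERPENT_PARALLEL_BLOCKS differing (4-way SSE2 on 32-bit, 8-way on 64-bit), so the glue code can be written once against the x-way names. A hedged sketch of how a caller might drive it; the real consumer is arch/x86/crypto/serpent_sse2_glue.c from this same merge, and the loop below is illustrative only (SERPENT_BLOCK_SIZE and the one-block __serpent_encrypt() fallback come from crypto/serpent.h):

/* Illustrative bulk-encrypt loop over the x-way interface. */
static void encrypt_chunk(struct serpent_ctx *ctx, u8 *dst, const u8 *src,
			  unsigned int nblocks)
{
	/* full parallel groups through the SSE2 path */
	while (nblocks >= SERPENT_PARALLEL_BLOCKS) {
		serpent_enc_blk_xway(ctx, dst, src);
		src += SERPENT_PARALLEL_BLOCKS * SERPENT_BLOCK_SIZE;
		dst += SERPENT_PARALLEL_BLOCKS * SERPENT_BLOCK_SIZE;
		nblocks -= SERPENT_PARALLEL_BLOCKS;
	}
	/* tail blocks one at a time through the generic implementation */
	while (nblocks--) {
		__serpent_encrypt(ctx, dst, src);
		src += SERPENT_BLOCK_SIZE;
		dst += SERPENT_BLOCK_SIZE;
	}
}
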
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551ec760..d0f19f9fb846 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
47extern void i386_reserve_resources(void); 47extern void i386_reserve_resources(void);
48extern void setup_default_timer_irq(void); 48extern void setup_default_timer_irq(void);
49 49
50#ifdef CONFIG_X86_MRST 50#ifdef CONFIG_X86_INTEL_MID
51extern void x86_mrst_early_setup(void); 51extern void x86_mrst_early_setup(void);
52#else 52#else
53static inline void x86_mrst_early_setup(void) { } 53static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
225 225
226#endif /* CONFIG_X86_LOCAL_APIC */ 226#endif /* CONFIG_X86_LOCAL_APIC */
227 227
228#ifdef CONFIG_DEBUG_NMI_SELFTEST
229extern void nmi_selftest(void);
230#else
231#define nmi_selftest() do { } while (0)
232#endif
233
228#endif /* __ASSEMBLY__ */ 234#endif /* __ASSEMBLY__ */
229#endif /* _ASM_X86_SMP_H */ 235#endif /* _ASM_X86_SMP_H */
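
Note the stub branch of nmi_selftest(): the do { } while (0) form, rather than an empty expansion, keeps the macro a single statement that demands its trailing semicolon, so call sites parse identically whether CONFIG_DEBUG_NMI_SELFTEST is enabled or not. For instance (the caller fragment is invented for illustration):

	if (smp_done)
		nmi_selftest();	/* fine as an unbraced if-body either way */
	else
		do_something_else();
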
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c4a348f7bd43..d962e5652a73 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <asm/asm-offsets.h> /* For NR_syscalls */
18 19
19extern const unsigned long sys_call_table[]; 20extern const unsigned long sys_call_table[];
20 21
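
With the hand-maintained unistd_*.h tables going away (see the deletions below), NR_syscalls is now derived at build time and exported through the generated asm/asm-offsets.h, which is why syscall.h gains this include. Its role is to bound indexing into sys_call_table; a hypothetical shape of that check, for orientation only (the real test lives in the entry_*.S assembly):

	/* dispatch sketch, not the actual entry code */
	if (nr >= NR_syscalls)
		return -ENOSYS;
	/* otherwise indirect through sys_call_table[nr] */
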
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 56a63ff7665e..bc817cd8b443 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -91,7 +91,6 @@ struct thread_info {
91#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ 91#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
92#define TIF_DEBUG 21 /* uses debug registers */ 92#define TIF_DEBUG 21 /* uses debug registers */
93#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 93#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
94#define TIF_FREEZE 23 /* is freezing for suspend */
95#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
96#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ 95#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 96#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -113,7 +112,6 @@ struct thread_info {
113#define _TIF_FORK (1 << TIF_FORK) 112#define _TIF_FORK (1 << TIF_FORK)
114#define _TIF_DEBUG (1 << TIF_DEBUG) 113#define _TIF_DEBUG (1 << TIF_DEBUG)
115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 114#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
116#define _TIF_FREEZE (1 << TIF_FREEZE)
117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 115#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
118#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) 116#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
119#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 117#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 800f77c60051..b9676ae37ada 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -172,7 +172,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
172} 172}
173 173
174struct pci_bus; 174struct pci_bus;
175void x86_pci_root_bus_res_quirks(struct pci_bus *b); 175void x86_pci_root_bus_resources(int bus, struct list_head *resources);
176 176
177#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
178#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ 178#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
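
The topology.h hunk replaces the old per-bus quirk hook with x86_pci_root_bus_resources(), which fills a caller-supplied list_head with the root bus's windows so the scan can be handed a complete resource list up front. A sketch of the expected calling pattern, matching the list-taking pci_scan_root_bus() API (error handling omitted; sd stands in for the arch sysdata):

/* Hypothetical caller shape for the new interface. */
static struct pci_bus *scan_root_sketch(int busnum, void *sd)
{
	LIST_HEAD(resources);		/* on-stack resource list */

	x86_pci_root_bus_resources(busnum, &resources);
	return pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
}
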
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3e51d8..b4a3db7ce140 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,59 @@
1#ifndef _ASM_X86_UNISTD_H
2#define _ASM_X86_UNISTD_H 1
3
1#ifdef __KERNEL__ 4#ifdef __KERNEL__
2# ifdef CONFIG_X86_32 5# ifdef CONFIG_X86_32
3# include "unistd_32.h" 6
7# include <asm/unistd_32.h>
8# define __ARCH_WANT_IPC_PARSE_VERSION
9# define __ARCH_WANT_STAT64
10# define __ARCH_WANT_SYS_OLD_MMAP
11# define __ARCH_WANT_SYS_OLD_SELECT
12
4# else 13# else
5# include "unistd_64.h" 14
15# include <asm/unistd_64.h>
16# define __ARCH_WANT_COMPAT_SYS_TIME
17
6# endif 18# endif
19
20# define __ARCH_WANT_OLD_READDIR
21# define __ARCH_WANT_OLD_STAT
22# define __ARCH_WANT_SYS_ALARM
23# define __ARCH_WANT_SYS_FADVISE64
24# define __ARCH_WANT_SYS_GETHOSTNAME
25# define __ARCH_WANT_SYS_GETPGRP
26# define __ARCH_WANT_SYS_LLSEEK
27# define __ARCH_WANT_SYS_NICE
28# define __ARCH_WANT_SYS_OLDUMOUNT
29# define __ARCH_WANT_SYS_OLD_GETRLIMIT
30# define __ARCH_WANT_SYS_OLD_UNAME
31# define __ARCH_WANT_SYS_PAUSE
32# define __ARCH_WANT_SYS_RT_SIGACTION
33# define __ARCH_WANT_SYS_RT_SIGSUSPEND
34# define __ARCH_WANT_SYS_SGETMASK
35# define __ARCH_WANT_SYS_SIGNAL
36# define __ARCH_WANT_SYS_SIGPENDING
37# define __ARCH_WANT_SYS_SIGPROCMASK
38# define __ARCH_WANT_SYS_SOCKETCALL
39# define __ARCH_WANT_SYS_TIME
40# define __ARCH_WANT_SYS_UTIME
41# define __ARCH_WANT_SYS_WAITPID
42
43/*
44 * "Conditional" syscalls
45 *
46 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
47 * but it doesn't work on all toolchains, so we just do it by hand
48 */
49# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
50
7#else 51#else
8# ifdef __i386__ 52# ifdef __i386__
9# include "unistd_32.h" 53# include <asm/unistd_32.h>
10# else 54# else
11# include "unistd_64.h" 55# include <asm/unistd_64.h>
12# endif 56# endif
13#endif 57#endif
58
59#endif /* _ASM_X86_UNISTD_H */
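
One detail of the rewrite: cond_syscall() moves here from the per-width headers, still using the hand-rolled .weak/.set assembler pair because, per the comment, __attribute__((weak, alias(...))) is not reliable across all supported toolchains. The trick is easy to verify in isolation; a minimal userspace demonstration (sys_example is a placeholder name, not a real syscall):

/* Compile with gcc on x86; calls to sys_example resolve to
 * sys_ni_syscall unless a strong definition exists elsewhere. */
long sys_ni_syscall(void)
{
	return -38;	/* -ENOSYS */
}

asm(".weak\tsys_example\n\t.set\tsys_example,sys_ni_syscall");
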
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 599c77d38f33..000000000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,401 +0,0 @@
1#ifndef _ASM_X86_UNISTD_32_H
2#define _ASM_X86_UNISTD_32_H
3
4/*
5 * This file contains the system call numbers.
6 */
7
8#define __NR_restart_syscall 0
9#define __NR_exit 1
10#define __NR_fork 2
11#define __NR_read 3
12#define __NR_write 4
13#define __NR_open 5
14#define __NR_close 6
15#define __NR_waitpid 7
16#define __NR_creat 8
17#define __NR_link 9
18#define __NR_unlink 10
19#define __NR_execve 11
20#define __NR_chdir 12
21#define __NR_time 13
22#define __NR_mknod 14
23#define __NR_chmod 15
24#define __NR_lchown 16
25#define __NR_break 17
26#define __NR_oldstat 18
27#define __NR_lseek 19
28#define __NR_getpid 20
29#define __NR_mount 21
30#define __NR_umount 22
31#define __NR_setuid 23
32#define __NR_getuid 24
33#define __NR_stime 25
34#define __NR_ptrace 26
35#define __NR_alarm 27
36#define __NR_oldfstat 28
37#define __NR_pause 29
38#define __NR_utime 30
39#define __NR_stty 31
40#define __NR_gtty 32
41#define __NR_access 33
42#define __NR_nice 34
43#define __NR_ftime 35
44#define __NR_sync 36
45#define __NR_kill 37
46#define __NR_rename 38
47#define __NR_mkdir 39
48#define __NR_rmdir 40
49#define __NR_dup 41
50#define __NR_pipe 42
51#define __NR_times 43
52#define __NR_prof 44
53#define __NR_brk 45
54#define __NR_setgid 46
55#define __NR_getgid 47
56#define __NR_signal 48
57#define __NR_geteuid 49
58#define __NR_getegid 50
59#define __NR_acct 51
60#define __NR_umount2 52
61#define __NR_lock 53
62#define __NR_ioctl 54
63#define __NR_fcntl 55
64#define __NR_mpx 56
65#define __NR_setpgid 57
66#define __NR_ulimit 58
67#define __NR_oldolduname 59
68#define __NR_umask 60
69#define __NR_chroot 61
70#define __NR_ustat 62
71#define __NR_dup2 63
72#define __NR_getppid 64
73#define __NR_getpgrp 65
74#define __NR_setsid 66
75#define __NR_sigaction 67
76#define __NR_sgetmask 68
77#define __NR_ssetmask 69
78#define __NR_setreuid 70
79#define __NR_setregid 71
80#define __NR_sigsuspend 72
81#define __NR_sigpending 73
82#define __NR_sethostname 74
83#define __NR_setrlimit 75
84#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
85#define __NR_getrusage 77
86#define __NR_gettimeofday 78
87#define __NR_settimeofday 79
88#define __NR_getgroups 80
89#define __NR_setgroups 81
90#define __NR_select 82
91#define __NR_symlink 83
92#define __NR_oldlstat 84
93#define __NR_readlink 85
94#define __NR_uselib 86
95#define __NR_swapon 87
96#define __NR_reboot 88
97#define __NR_readdir 89
98#define __NR_mmap 90
99#define __NR_munmap 91
100#define __NR_truncate 92
101#define __NR_ftruncate 93
102#define __NR_fchmod 94
103#define __NR_fchown 95
104#define __NR_getpriority 96
105#define __NR_setpriority 97
106#define __NR_profil 98
107#define __NR_statfs 99
108#define __NR_fstatfs 100
109#define __NR_ioperm 101
110#define __NR_socketcall 102
111#define __NR_syslog 103
112#define __NR_setitimer 104
113#define __NR_getitimer 105
114#define __NR_stat 106
115#define __NR_lstat 107
116#define __NR_fstat 108
117#define __NR_olduname 109
118#define __NR_iopl 110
119#define __NR_vhangup 111
120#define __NR_idle 112
121#define __NR_vm86old 113
122#define __NR_wait4 114
123#define __NR_swapoff 115
124#define __NR_sysinfo 116
125#define __NR_ipc 117
126#define __NR_fsync 118
127#define __NR_sigreturn 119
128#define __NR_clone 120
129#define __NR_setdomainname 121
130#define __NR_uname 122
131#define __NR_modify_ldt 123
132#define __NR_adjtimex 124
133#define __NR_mprotect 125
134#define __NR_sigprocmask 126
135#define __NR_create_module 127
136#define __NR_init_module 128
137#define __NR_delete_module 129
138#define __NR_get_kernel_syms 130
139#define __NR_quotactl 131
140#define __NR_getpgid 132
141#define __NR_fchdir 133
142#define __NR_bdflush 134
143#define __NR_sysfs 135
144#define __NR_personality 136
145#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
146#define __NR_setfsuid 138
147#define __NR_setfsgid 139
148#define __NR__llseek 140
149#define __NR_getdents 141
150#define __NR__newselect 142
151#define __NR_flock 143
152#define __NR_msync 144
153#define __NR_readv 145
154#define __NR_writev 146
155#define __NR_getsid 147
156#define __NR_fdatasync 148
157#define __NR__sysctl 149
158#define __NR_mlock 150
159#define __NR_munlock 151
160#define __NR_mlockall 152
161#define __NR_munlockall 153
162#define __NR_sched_setparam 154
163#define __NR_sched_getparam 155
164#define __NR_sched_setscheduler 156
165#define __NR_sched_getscheduler 157
166#define __NR_sched_yield 158
167#define __NR_sched_get_priority_max 159
168#define __NR_sched_get_priority_min 160
169#define __NR_sched_rr_get_interval 161
170#define __NR_nanosleep 162
171#define __NR_mremap 163
172#define __NR_setresuid 164
173#define __NR_getresuid 165
174#define __NR_vm86 166
175#define __NR_query_module 167
176#define __NR_poll 168
177#define __NR_nfsservctl 169
178#define __NR_setresgid 170
179#define __NR_getresgid 171
180#define __NR_prctl 172
181#define __NR_rt_sigreturn 173
182#define __NR_rt_sigaction 174
183#define __NR_rt_sigprocmask 175
184#define __NR_rt_sigpending 176
185#define __NR_rt_sigtimedwait 177
186#define __NR_rt_sigqueueinfo 178
187#define __NR_rt_sigsuspend 179
188#define __NR_pread64 180
189#define __NR_pwrite64 181
190#define __NR_chown 182
191#define __NR_getcwd 183
192#define __NR_capget 184
193#define __NR_capset 185
194#define __NR_sigaltstack 186
195#define __NR_sendfile 187
196#define __NR_getpmsg 188 /* some people actually want streams */
197#define __NR_putpmsg 189 /* some people actually want streams */
198#define __NR_vfork 190
199#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
200#define __NR_mmap2 192
201#define __NR_truncate64 193
202#define __NR_ftruncate64 194
203#define __NR_stat64 195
204#define __NR_lstat64 196
205#define __NR_fstat64 197
206#define __NR_lchown32 198
207#define __NR_getuid32 199
208#define __NR_getgid32 200
209#define __NR_geteuid32 201
210#define __NR_getegid32 202
211#define __NR_setreuid32 203
212#define __NR_setregid32 204
213#define __NR_getgroups32 205
214#define __NR_setgroups32 206
215#define __NR_fchown32 207
216#define __NR_setresuid32 208
217#define __NR_getresuid32 209
218#define __NR_setresgid32 210
219#define __NR_getresgid32 211
220#define __NR_chown32 212
221#define __NR_setuid32 213
222#define __NR_setgid32 214
223#define __NR_setfsuid32 215
224#define __NR_setfsgid32 216
225#define __NR_pivot_root 217
226#define __NR_mincore 218
227#define __NR_madvise 219
228#define __NR_madvise1 219 /* delete when C lib stub is removed */
229#define __NR_getdents64 220
230#define __NR_fcntl64 221
231/* 223 is unused */
232#define __NR_gettid 224
233#define __NR_readahead 225
234#define __NR_setxattr 226
235#define __NR_lsetxattr 227
236#define __NR_fsetxattr 228
237#define __NR_getxattr 229
238#define __NR_lgetxattr 230
239#define __NR_fgetxattr 231
240#define __NR_listxattr 232
241#define __NR_llistxattr 233
242#define __NR_flistxattr 234
243#define __NR_removexattr 235
244#define __NR_lremovexattr 236
245#define __NR_fremovexattr 237
246#define __NR_tkill 238
247#define __NR_sendfile64 239
248#define __NR_futex 240
249#define __NR_sched_setaffinity 241
250#define __NR_sched_getaffinity 242
251#define __NR_set_thread_area 243
252#define __NR_get_thread_area 244
253#define __NR_io_setup 245
254#define __NR_io_destroy 246
255#define __NR_io_getevents 247
256#define __NR_io_submit 248
257#define __NR_io_cancel 249
258#define __NR_fadvise64 250
259/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
260#define __NR_exit_group 252
261#define __NR_lookup_dcookie 253
262#define __NR_epoll_create 254
263#define __NR_epoll_ctl 255
264#define __NR_epoll_wait 256
265#define __NR_remap_file_pages 257
266#define __NR_set_tid_address 258
267#define __NR_timer_create 259
268#define __NR_timer_settime (__NR_timer_create+1)
269#define __NR_timer_gettime (__NR_timer_create+2)
270#define __NR_timer_getoverrun (__NR_timer_create+3)
271#define __NR_timer_delete (__NR_timer_create+4)
272#define __NR_clock_settime (__NR_timer_create+5)
273#define __NR_clock_gettime (__NR_timer_create+6)
274#define __NR_clock_getres (__NR_timer_create+7)
275#define __NR_clock_nanosleep (__NR_timer_create+8)
276#define __NR_statfs64 268
277#define __NR_fstatfs64 269
278#define __NR_tgkill 270
279#define __NR_utimes 271
280#define __NR_fadvise64_64 272
281#define __NR_vserver 273
282#define __NR_mbind 274
283#define __NR_get_mempolicy 275
284#define __NR_set_mempolicy 276
285#define __NR_mq_open 277
286#define __NR_mq_unlink (__NR_mq_open+1)
287#define __NR_mq_timedsend (__NR_mq_open+2)
288#define __NR_mq_timedreceive (__NR_mq_open+3)
289#define __NR_mq_notify (__NR_mq_open+4)
290#define __NR_mq_getsetattr (__NR_mq_open+5)
291#define __NR_kexec_load 283
292#define __NR_waitid 284
293/* #define __NR_sys_setaltroot 285 */
294#define __NR_add_key 286
295#define __NR_request_key 287
296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
299#define __NR_inotify_init 291
300#define __NR_inotify_add_watch 292
301#define __NR_inotify_rm_watch 293
302#define __NR_migrate_pages 294
303#define __NR_openat 295
304#define __NR_mkdirat 296
305#define __NR_mknodat 297
306#define __NR_fchownat 298
307#define __NR_futimesat 299
308#define __NR_fstatat64 300
309#define __NR_unlinkat 301
310#define __NR_renameat 302
311#define __NR_linkat 303
312#define __NR_symlinkat 304
313#define __NR_readlinkat 305
314#define __NR_fchmodat 306
315#define __NR_faccessat 307
316#define __NR_pselect6 308
317#define __NR_ppoll 309
318#define __NR_unshare 310
319#define __NR_set_robust_list 311
320#define __NR_get_robust_list 312
321#define __NR_splice 313
322#define __NR_sync_file_range 314
323#define __NR_tee 315
324#define __NR_vmsplice 316
325#define __NR_move_pages 317
326#define __NR_getcpu 318
327#define __NR_epoll_pwait 319
328#define __NR_utimensat 320
329#define __NR_signalfd 321
330#define __NR_timerfd_create 322
331#define __NR_eventfd 323
332#define __NR_fallocate 324
333#define __NR_timerfd_settime 325
334#define __NR_timerfd_gettime 326
335#define __NR_signalfd4 327
336#define __NR_eventfd2 328
337#define __NR_epoll_create1 329
338#define __NR_dup3 330
339#define __NR_pipe2 331
340#define __NR_inotify_init1 332
341#define __NR_preadv 333
342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
353#define __NR_sendmmsg 345
354#define __NR_setns 346
355#define __NR_process_vm_readv 347
356#define __NR_process_vm_writev 348
357
358#ifdef __KERNEL__
359
360#define NR_syscalls 349
361
362#define __ARCH_WANT_IPC_PARSE_VERSION
363#define __ARCH_WANT_OLD_READDIR
364#define __ARCH_WANT_OLD_STAT
365#define __ARCH_WANT_STAT64
366#define __ARCH_WANT_SYS_ALARM
367#define __ARCH_WANT_SYS_GETHOSTNAME
368#define __ARCH_WANT_SYS_IPC
369#define __ARCH_WANT_SYS_PAUSE
370#define __ARCH_WANT_SYS_SGETMASK
371#define __ARCH_WANT_SYS_SIGNAL
372#define __ARCH_WANT_SYS_TIME
373#define __ARCH_WANT_SYS_UTIME
374#define __ARCH_WANT_SYS_WAITPID
375#define __ARCH_WANT_SYS_SOCKETCALL
376#define __ARCH_WANT_SYS_FADVISE64
377#define __ARCH_WANT_SYS_GETPGRP
378#define __ARCH_WANT_SYS_LLSEEK
379#define __ARCH_WANT_SYS_NICE
380#define __ARCH_WANT_SYS_OLD_GETRLIMIT
381#define __ARCH_WANT_SYS_OLD_UNAME
382#define __ARCH_WANT_SYS_OLD_MMAP
383#define __ARCH_WANT_SYS_OLD_SELECT
384#define __ARCH_WANT_SYS_OLDUMOUNT
385#define __ARCH_WANT_SYS_SIGPENDING
386#define __ARCH_WANT_SYS_SIGPROCMASK
387#define __ARCH_WANT_SYS_RT_SIGACTION
388#define __ARCH_WANT_SYS_RT_SIGSUSPEND
389
390/*
391 * "Conditional" syscalls
392 *
393 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
394 * but it doesn't work on all toolchains, so we just do it by hand
395 */
396#ifndef cond_syscall
397#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
398#endif
399
400#endif /* __KERNEL__ */
401#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 0431f193c3f2..000000000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,732 +0,0 @@
1#ifndef _ASM_X86_UNISTD_64_H
2#define _ASM_X86_UNISTD_64_H
3
4#ifndef __SYSCALL
5#define __SYSCALL(a, b)
6#endif
7
8/*
9 * This file contains the system call numbers.
10 *
11 * Note: holes are not allowed.
12 */
13
14/* at least 8 syscall per cacheline */
15#define __NR_read 0
16__SYSCALL(__NR_read, sys_read)
17#define __NR_write 1
18__SYSCALL(__NR_write, sys_write)
19#define __NR_open 2
20__SYSCALL(__NR_open, sys_open)
21#define __NR_close 3
22__SYSCALL(__NR_close, sys_close)
23#define __NR_stat 4
24__SYSCALL(__NR_stat, sys_newstat)
25#define __NR_fstat 5
26__SYSCALL(__NR_fstat, sys_newfstat)
27#define __NR_lstat 6
28__SYSCALL(__NR_lstat, sys_newlstat)
29#define __NR_poll 7
30__SYSCALL(__NR_poll, sys_poll)
31
32#define __NR_lseek 8
33__SYSCALL(__NR_lseek, sys_lseek)
34#define __NR_mmap 9
35__SYSCALL(__NR_mmap, sys_mmap)
36#define __NR_mprotect 10
37__SYSCALL(__NR_mprotect, sys_mprotect)
38#define __NR_munmap 11
39__SYSCALL(__NR_munmap, sys_munmap)
40#define __NR_brk 12
41__SYSCALL(__NR_brk, sys_brk)
42#define __NR_rt_sigaction 13
43__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
44#define __NR_rt_sigprocmask 14
45__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
46#define __NR_rt_sigreturn 15
47__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
48
49#define __NR_ioctl 16
50__SYSCALL(__NR_ioctl, sys_ioctl)
51#define __NR_pread64 17
52__SYSCALL(__NR_pread64, sys_pread64)
53#define __NR_pwrite64 18
54__SYSCALL(__NR_pwrite64, sys_pwrite64)
55#define __NR_readv 19
56__SYSCALL(__NR_readv, sys_readv)
57#define __NR_writev 20
58__SYSCALL(__NR_writev, sys_writev)
59#define __NR_access 21
60__SYSCALL(__NR_access, sys_access)
61#define __NR_pipe 22
62__SYSCALL(__NR_pipe, sys_pipe)
63#define __NR_select 23
64__SYSCALL(__NR_select, sys_select)
65
66#define __NR_sched_yield 24
67__SYSCALL(__NR_sched_yield, sys_sched_yield)
68#define __NR_mremap 25
69__SYSCALL(__NR_mremap, sys_mremap)
70#define __NR_msync 26
71__SYSCALL(__NR_msync, sys_msync)
72#define __NR_mincore 27
73__SYSCALL(__NR_mincore, sys_mincore)
74#define __NR_madvise 28
75__SYSCALL(__NR_madvise, sys_madvise)
76#define __NR_shmget 29
77__SYSCALL(__NR_shmget, sys_shmget)
78#define __NR_shmat 30
79__SYSCALL(__NR_shmat, sys_shmat)
80#define __NR_shmctl 31
81__SYSCALL(__NR_shmctl, sys_shmctl)
82
83#define __NR_dup 32
84__SYSCALL(__NR_dup, sys_dup)
85#define __NR_dup2 33
86__SYSCALL(__NR_dup2, sys_dup2)
87#define __NR_pause 34
88__SYSCALL(__NR_pause, sys_pause)
89#define __NR_nanosleep 35
90__SYSCALL(__NR_nanosleep, sys_nanosleep)
91#define __NR_getitimer 36
92__SYSCALL(__NR_getitimer, sys_getitimer)
93#define __NR_alarm 37
94__SYSCALL(__NR_alarm, sys_alarm)
95#define __NR_setitimer 38
96__SYSCALL(__NR_setitimer, sys_setitimer)
97#define __NR_getpid 39
98__SYSCALL(__NR_getpid, sys_getpid)
99
100#define __NR_sendfile 40
101__SYSCALL(__NR_sendfile, sys_sendfile64)
102#define __NR_socket 41
103__SYSCALL(__NR_socket, sys_socket)
104#define __NR_connect 42
105__SYSCALL(__NR_connect, sys_connect)
106#define __NR_accept 43
107__SYSCALL(__NR_accept, sys_accept)
108#define __NR_sendto 44
109__SYSCALL(__NR_sendto, sys_sendto)
110#define __NR_recvfrom 45
111__SYSCALL(__NR_recvfrom, sys_recvfrom)
112#define __NR_sendmsg 46
113__SYSCALL(__NR_sendmsg, sys_sendmsg)
114#define __NR_recvmsg 47
115__SYSCALL(__NR_recvmsg, sys_recvmsg)
116
117#define __NR_shutdown 48
118__SYSCALL(__NR_shutdown, sys_shutdown)
119#define __NR_bind 49
120__SYSCALL(__NR_bind, sys_bind)
121#define __NR_listen 50
122__SYSCALL(__NR_listen, sys_listen)
123#define __NR_getsockname 51
124__SYSCALL(__NR_getsockname, sys_getsockname)
125#define __NR_getpeername 52
126__SYSCALL(__NR_getpeername, sys_getpeername)
127#define __NR_socketpair 53
128__SYSCALL(__NR_socketpair, sys_socketpair)
129#define __NR_setsockopt 54
130__SYSCALL(__NR_setsockopt, sys_setsockopt)
131#define __NR_getsockopt 55
132__SYSCALL(__NR_getsockopt, sys_getsockopt)
133
134#define __NR_clone 56
135__SYSCALL(__NR_clone, stub_clone)
136#define __NR_fork 57
137__SYSCALL(__NR_fork, stub_fork)
138#define __NR_vfork 58
139__SYSCALL(__NR_vfork, stub_vfork)
140#define __NR_execve 59
141__SYSCALL(__NR_execve, stub_execve)
142#define __NR_exit 60
143__SYSCALL(__NR_exit, sys_exit)
144#define __NR_wait4 61
145__SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_newuname)
150
151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget)
153#define __NR_semop 65
154__SYSCALL(__NR_semop, sys_semop)
155#define __NR_semctl 66
156__SYSCALL(__NR_semctl, sys_semctl)
157#define __NR_shmdt 67
158__SYSCALL(__NR_shmdt, sys_shmdt)
159#define __NR_msgget 68
160__SYSCALL(__NR_msgget, sys_msgget)
161#define __NR_msgsnd 69
162__SYSCALL(__NR_msgsnd, sys_msgsnd)
163#define __NR_msgrcv 70
164__SYSCALL(__NR_msgrcv, sys_msgrcv)
165#define __NR_msgctl 71
166__SYSCALL(__NR_msgctl, sys_msgctl)
167
168#define __NR_fcntl 72
169__SYSCALL(__NR_fcntl, sys_fcntl)
170#define __NR_flock 73
171__SYSCALL(__NR_flock, sys_flock)
172#define __NR_fsync 74
173__SYSCALL(__NR_fsync, sys_fsync)
174#define __NR_fdatasync 75
175__SYSCALL(__NR_fdatasync, sys_fdatasync)
176#define __NR_truncate 76
177__SYSCALL(__NR_truncate, sys_truncate)
178#define __NR_ftruncate 77
179__SYSCALL(__NR_ftruncate, sys_ftruncate)
180#define __NR_getdents 78
181__SYSCALL(__NR_getdents, sys_getdents)
182#define __NR_getcwd 79
183__SYSCALL(__NR_getcwd, sys_getcwd)
184
185#define __NR_chdir 80
186__SYSCALL(__NR_chdir, sys_chdir)
187#define __NR_fchdir 81
188__SYSCALL(__NR_fchdir, sys_fchdir)
189#define __NR_rename 82
190__SYSCALL(__NR_rename, sys_rename)
191#define __NR_mkdir 83
192__SYSCALL(__NR_mkdir, sys_mkdir)
193#define __NR_rmdir 84
194__SYSCALL(__NR_rmdir, sys_rmdir)
195#define __NR_creat 85
196__SYSCALL(__NR_creat, sys_creat)
197#define __NR_link 86
198__SYSCALL(__NR_link, sys_link)
199#define __NR_unlink 87
200__SYSCALL(__NR_unlink, sys_unlink)
201
202#define __NR_symlink 88
203__SYSCALL(__NR_symlink, sys_symlink)
204#define __NR_readlink 89
205__SYSCALL(__NR_readlink, sys_readlink)
206#define __NR_chmod 90
207__SYSCALL(__NR_chmod, sys_chmod)
208#define __NR_fchmod 91
209__SYSCALL(__NR_fchmod, sys_fchmod)
210#define __NR_chown 92
211__SYSCALL(__NR_chown, sys_chown)
212#define __NR_fchown 93
213__SYSCALL(__NR_fchown, sys_fchown)
214#define __NR_lchown 94
215__SYSCALL(__NR_lchown, sys_lchown)
216#define __NR_umask 95
217__SYSCALL(__NR_umask, sys_umask)
218
219#define __NR_gettimeofday 96
220__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
221#define __NR_getrlimit 97
222__SYSCALL(__NR_getrlimit, sys_getrlimit)
223#define __NR_getrusage 98
224__SYSCALL(__NR_getrusage, sys_getrusage)
225#define __NR_sysinfo 99
226__SYSCALL(__NR_sysinfo, sys_sysinfo)
227#define __NR_times 100
228__SYSCALL(__NR_times, sys_times)
229#define __NR_ptrace 101
230__SYSCALL(__NR_ptrace, sys_ptrace)
231#define __NR_getuid 102
232__SYSCALL(__NR_getuid, sys_getuid)
233#define __NR_syslog 103
234__SYSCALL(__NR_syslog, sys_syslog)
235
236/* at the very end the stuff that never runs during the benchmarks */
237#define __NR_getgid 104
238__SYSCALL(__NR_getgid, sys_getgid)
239#define __NR_setuid 105
240__SYSCALL(__NR_setuid, sys_setuid)
241#define __NR_setgid 106
242__SYSCALL(__NR_setgid, sys_setgid)
243#define __NR_geteuid 107
244__SYSCALL(__NR_geteuid, sys_geteuid)
245#define __NR_getegid 108
246__SYSCALL(__NR_getegid, sys_getegid)
247#define __NR_setpgid 109
248__SYSCALL(__NR_setpgid, sys_setpgid)
249#define __NR_getppid 110
250__SYSCALL(__NR_getppid, sys_getppid)
251#define __NR_getpgrp 111
252__SYSCALL(__NR_getpgrp, sys_getpgrp)
253
254#define __NR_setsid 112
255__SYSCALL(__NR_setsid, sys_setsid)
256#define __NR_setreuid 113
257__SYSCALL(__NR_setreuid, sys_setreuid)
258#define __NR_setregid 114
259__SYSCALL(__NR_setregid, sys_setregid)
260#define __NR_getgroups 115
261__SYSCALL(__NR_getgroups, sys_getgroups)
262#define __NR_setgroups 116
263__SYSCALL(__NR_setgroups, sys_setgroups)
264#define __NR_setresuid 117
265__SYSCALL(__NR_setresuid, sys_setresuid)
266#define __NR_getresuid 118
267__SYSCALL(__NR_getresuid, sys_getresuid)
268#define __NR_setresgid 119
269__SYSCALL(__NR_setresgid, sys_setresgid)
270
271#define __NR_getresgid 120
272__SYSCALL(__NR_getresgid, sys_getresgid)
273#define __NR_getpgid 121
274__SYSCALL(__NR_getpgid, sys_getpgid)
275#define __NR_setfsuid 122
276__SYSCALL(__NR_setfsuid, sys_setfsuid)
277#define __NR_setfsgid 123
278__SYSCALL(__NR_setfsgid, sys_setfsgid)
279#define __NR_getsid 124
280__SYSCALL(__NR_getsid, sys_getsid)
281#define __NR_capget 125
282__SYSCALL(__NR_capget, sys_capget)
283#define __NR_capset 126
284__SYSCALL(__NR_capset, sys_capset)
285
286#define __NR_rt_sigpending 127
287__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
288#define __NR_rt_sigtimedwait 128
289__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
290#define __NR_rt_sigqueueinfo 129
291__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
292#define __NR_rt_sigsuspend 130
293__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
294#define __NR_sigaltstack 131
295__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
296#define __NR_utime 132
297__SYSCALL(__NR_utime, sys_utime)
298#define __NR_mknod 133
299__SYSCALL(__NR_mknod, sys_mknod)
300
301/* Only needed for a.out */
302#define __NR_uselib 134
303__SYSCALL(__NR_uselib, sys_ni_syscall)
304#define __NR_personality 135
305__SYSCALL(__NR_personality, sys_personality)
306
307#define __NR_ustat 136
308__SYSCALL(__NR_ustat, sys_ustat)
309#define __NR_statfs 137
310__SYSCALL(__NR_statfs, sys_statfs)
311#define __NR_fstatfs 138
312__SYSCALL(__NR_fstatfs, sys_fstatfs)
313#define __NR_sysfs 139
314__SYSCALL(__NR_sysfs, sys_sysfs)
315
316#define __NR_getpriority 140
317__SYSCALL(__NR_getpriority, sys_getpriority)
318#define __NR_setpriority 141
319__SYSCALL(__NR_setpriority, sys_setpriority)
320#define __NR_sched_setparam 142
321__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
322#define __NR_sched_getparam 143
323__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
324#define __NR_sched_setscheduler 144
325__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
326#define __NR_sched_getscheduler 145
327__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
328#define __NR_sched_get_priority_max 146
329__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
330#define __NR_sched_get_priority_min 147
331__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
332#define __NR_sched_rr_get_interval 148
333__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
334
335#define __NR_mlock 149
336__SYSCALL(__NR_mlock, sys_mlock)
337#define __NR_munlock 150
338__SYSCALL(__NR_munlock, sys_munlock)
339#define __NR_mlockall 151
340__SYSCALL(__NR_mlockall, sys_mlockall)
341#define __NR_munlockall 152
342__SYSCALL(__NR_munlockall, sys_munlockall)
343
344#define __NR_vhangup 153
345__SYSCALL(__NR_vhangup, sys_vhangup)
346
347#define __NR_modify_ldt 154
348__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
349
350#define __NR_pivot_root 155
351__SYSCALL(__NR_pivot_root, sys_pivot_root)
352
353#define __NR__sysctl 156
354__SYSCALL(__NR__sysctl, sys_sysctl)
355
356#define __NR_prctl 157
357__SYSCALL(__NR_prctl, sys_prctl)
358#define __NR_arch_prctl 158
359__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
360
361#define __NR_adjtimex 159
362__SYSCALL(__NR_adjtimex, sys_adjtimex)
363
364#define __NR_setrlimit 160
365__SYSCALL(__NR_setrlimit, sys_setrlimit)
366
367#define __NR_chroot 161
368__SYSCALL(__NR_chroot, sys_chroot)
369
370#define __NR_sync 162
371__SYSCALL(__NR_sync, sys_sync)
372
373#define __NR_acct 163
374__SYSCALL(__NR_acct, sys_acct)
375
376#define __NR_settimeofday 164
377__SYSCALL(__NR_settimeofday, sys_settimeofday)
378
379#define __NR_mount 165
380__SYSCALL(__NR_mount, sys_mount)
381#define __NR_umount2 166
382__SYSCALL(__NR_umount2, sys_umount)
383
384#define __NR_swapon 167
385__SYSCALL(__NR_swapon, sys_swapon)
386#define __NR_swapoff 168
387__SYSCALL(__NR_swapoff, sys_swapoff)
388
389#define __NR_reboot 169
390__SYSCALL(__NR_reboot, sys_reboot)
391
392#define __NR_sethostname 170
393__SYSCALL(__NR_sethostname, sys_sethostname)
394#define __NR_setdomainname 171
395__SYSCALL(__NR_setdomainname, sys_setdomainname)
396
397#define __NR_iopl 172
398__SYSCALL(__NR_iopl, stub_iopl)
399#define __NR_ioperm 173
400__SYSCALL(__NR_ioperm, sys_ioperm)
401
402#define __NR_create_module 174
403__SYSCALL(__NR_create_module, sys_ni_syscall)
404#define __NR_init_module 175
405__SYSCALL(__NR_init_module, sys_init_module)
406#define __NR_delete_module 176
407__SYSCALL(__NR_delete_module, sys_delete_module)
408#define __NR_get_kernel_syms 177
409__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
410#define __NR_query_module 178
411__SYSCALL(__NR_query_module, sys_ni_syscall)
412
413#define __NR_quotactl 179
414__SYSCALL(__NR_quotactl, sys_quotactl)
415
416#define __NR_nfsservctl 180
417__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
418
419/* reserved for LiS/STREAMS */
420#define __NR_getpmsg 181
421__SYSCALL(__NR_getpmsg, sys_ni_syscall)
422#define __NR_putpmsg 182
423__SYSCALL(__NR_putpmsg, sys_ni_syscall)
424
425/* reserved for AFS */
426#define __NR_afs_syscall 183
427__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
428
429/* reserved for tux */
430#define __NR_tuxcall 184
431__SYSCALL(__NR_tuxcall, sys_ni_syscall)
432
433#define __NR_security 185
434__SYSCALL(__NR_security, sys_ni_syscall)
435
436#define __NR_gettid 186
437__SYSCALL(__NR_gettid, sys_gettid)
438
439#define __NR_readahead 187
440__SYSCALL(__NR_readahead, sys_readahead)
441#define __NR_setxattr 188
442__SYSCALL(__NR_setxattr, sys_setxattr)
443#define __NR_lsetxattr 189
444__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
445#define __NR_fsetxattr 190
446__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
447#define __NR_getxattr 191
448__SYSCALL(__NR_getxattr, sys_getxattr)
449#define __NR_lgetxattr 192
450__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
451#define __NR_fgetxattr 193
452__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
453#define __NR_listxattr 194
454__SYSCALL(__NR_listxattr, sys_listxattr)
455#define __NR_llistxattr 195
456__SYSCALL(__NR_llistxattr, sys_llistxattr)
457#define __NR_flistxattr 196
458__SYSCALL(__NR_flistxattr, sys_flistxattr)
459#define __NR_removexattr 197
460__SYSCALL(__NR_removexattr, sys_removexattr)
461#define __NR_lremovexattr 198
462__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
463#define __NR_fremovexattr 199
464__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
465#define __NR_tkill 200
466__SYSCALL(__NR_tkill, sys_tkill)
467#define __NR_time 201
468__SYSCALL(__NR_time, sys_time)
469#define __NR_futex 202
470__SYSCALL(__NR_futex, sys_futex)
471#define __NR_sched_setaffinity 203
472__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
473#define __NR_sched_getaffinity 204
474__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
475#define __NR_set_thread_area 205
476__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
477#define __NR_io_setup 206
478__SYSCALL(__NR_io_setup, sys_io_setup)
479#define __NR_io_destroy 207
480__SYSCALL(__NR_io_destroy, sys_io_destroy)
481#define __NR_io_getevents 208
482__SYSCALL(__NR_io_getevents, sys_io_getevents)
483#define __NR_io_submit 209
484__SYSCALL(__NR_io_submit, sys_io_submit)
485#define __NR_io_cancel 210
486__SYSCALL(__NR_io_cancel, sys_io_cancel)
487#define __NR_get_thread_area 211
488__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
489#define __NR_lookup_dcookie 212
490__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
491#define __NR_epoll_create 213
492__SYSCALL(__NR_epoll_create, sys_epoll_create)
493#define __NR_epoll_ctl_old 214
494__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
495#define __NR_epoll_wait_old 215
496__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
497#define __NR_remap_file_pages 216
498__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
499#define __NR_getdents64 217
500__SYSCALL(__NR_getdents64, sys_getdents64)
501#define __NR_set_tid_address 218
502__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
503#define __NR_restart_syscall 219
504__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
505#define __NR_semtimedop 220
506__SYSCALL(__NR_semtimedop, sys_semtimedop)
507#define __NR_fadvise64 221
508__SYSCALL(__NR_fadvise64, sys_fadvise64)
509#define __NR_timer_create 222
510__SYSCALL(__NR_timer_create, sys_timer_create)
511#define __NR_timer_settime 223
512__SYSCALL(__NR_timer_settime, sys_timer_settime)
513#define __NR_timer_gettime 224
514__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
515#define __NR_timer_getoverrun 225
516__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
517#define __NR_timer_delete 226
518__SYSCALL(__NR_timer_delete, sys_timer_delete)
519#define __NR_clock_settime 227
520__SYSCALL(__NR_clock_settime, sys_clock_settime)
521#define __NR_clock_gettime 228
522__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
523#define __NR_clock_getres 229
524__SYSCALL(__NR_clock_getres, sys_clock_getres)
525#define __NR_clock_nanosleep 230
526__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
527#define __NR_exit_group 231
528__SYSCALL(__NR_exit_group, sys_exit_group)
529#define __NR_epoll_wait 232
530__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
531#define __NR_epoll_ctl 233
532__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
533#define __NR_tgkill 234
534__SYSCALL(__NR_tgkill, sys_tgkill)
535#define __NR_utimes 235
536__SYSCALL(__NR_utimes, sys_utimes)
537#define __NR_vserver 236
538__SYSCALL(__NR_vserver, sys_ni_syscall)
539#define __NR_mbind 237
540__SYSCALL(__NR_mbind, sys_mbind)
541#define __NR_set_mempolicy 238
542__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
543#define __NR_get_mempolicy 239
544__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
545#define __NR_mq_open 240
546__SYSCALL(__NR_mq_open, sys_mq_open)
547#define __NR_mq_unlink 241
548__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
549#define __NR_mq_timedsend 242
550__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
551#define __NR_mq_timedreceive 243
552__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
553#define __NR_mq_notify 244
554__SYSCALL(__NR_mq_notify, sys_mq_notify)
555#define __NR_mq_getsetattr 245
556__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
557#define __NR_kexec_load 246
558__SYSCALL(__NR_kexec_load, sys_kexec_load)
559#define __NR_waitid 247
560__SYSCALL(__NR_waitid, sys_waitid)
561#define __NR_add_key 248
562__SYSCALL(__NR_add_key, sys_add_key)
563#define __NR_request_key 249
564__SYSCALL(__NR_request_key, sys_request_key)
565#define __NR_keyctl 250
566__SYSCALL(__NR_keyctl, sys_keyctl)
567#define __NR_ioprio_set 251
568__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
569#define __NR_ioprio_get 252
570__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
571#define __NR_inotify_init 253
572__SYSCALL(__NR_inotify_init, sys_inotify_init)
573#define __NR_inotify_add_watch 254
574__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
575#define __NR_inotify_rm_watch 255
576__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
577#define __NR_migrate_pages 256
578__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
579#define __NR_openat 257
580__SYSCALL(__NR_openat, sys_openat)
581#define __NR_mkdirat 258
582__SYSCALL(__NR_mkdirat, sys_mkdirat)
583#define __NR_mknodat 259
584__SYSCALL(__NR_mknodat, sys_mknodat)
585#define __NR_fchownat 260
586__SYSCALL(__NR_fchownat, sys_fchownat)
587#define __NR_futimesat 261
588__SYSCALL(__NR_futimesat, sys_futimesat)
589#define __NR_newfstatat 262
590__SYSCALL(__NR_newfstatat, sys_newfstatat)
591#define __NR_unlinkat 263
592__SYSCALL(__NR_unlinkat, sys_unlinkat)
593#define __NR_renameat 264
594__SYSCALL(__NR_renameat, sys_renameat)
595#define __NR_linkat 265
596__SYSCALL(__NR_linkat, sys_linkat)
597#define __NR_symlinkat 266
598__SYSCALL(__NR_symlinkat, sys_symlinkat)
599#define __NR_readlinkat 267
600__SYSCALL(__NR_readlinkat, sys_readlinkat)
601#define __NR_fchmodat 268
602__SYSCALL(__NR_fchmodat, sys_fchmodat)
603#define __NR_faccessat 269
604__SYSCALL(__NR_faccessat, sys_faccessat)
605#define __NR_pselect6 270
606__SYSCALL(__NR_pselect6, sys_pselect6)
607#define __NR_ppoll 271
608__SYSCALL(__NR_ppoll, sys_ppoll)
609#define __NR_unshare 272
610__SYSCALL(__NR_unshare, sys_unshare)
611#define __NR_set_robust_list 273
612__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
613#define __NR_get_robust_list 274
614__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
615#define __NR_splice 275
616__SYSCALL(__NR_splice, sys_splice)
617#define __NR_tee 276
618__SYSCALL(__NR_tee, sys_tee)
619#define __NR_sync_file_range 277
620__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
621#define __NR_vmsplice 278
622__SYSCALL(__NR_vmsplice, sys_vmsplice)
623#define __NR_move_pages 279
624__SYSCALL(__NR_move_pages, sys_move_pages)
625#define __NR_utimensat 280
626__SYSCALL(__NR_utimensat, sys_utimensat)
627#define __NR_epoll_pwait 281
628__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
629#define __NR_signalfd 282
630__SYSCALL(__NR_signalfd, sys_signalfd)
631#define __NR_timerfd_create 283
632__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
633#define __NR_eventfd 284
634__SYSCALL(__NR_eventfd, sys_eventfd)
635#define __NR_fallocate 285
636__SYSCALL(__NR_fallocate, sys_fallocate)
637#define __NR_timerfd_settime 286
638__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
639#define __NR_timerfd_gettime 287
640__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
641#define __NR_accept4 288
642__SYSCALL(__NR_accept4, sys_accept4)
643#define __NR_signalfd4 289
644__SYSCALL(__NR_signalfd4, sys_signalfd4)
645#define __NR_eventfd2 290
646__SYSCALL(__NR_eventfd2, sys_eventfd2)
647#define __NR_epoll_create1 291
648__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
649#define __NR_dup3 292
650__SYSCALL(__NR_dup3, sys_dup3)
651#define __NR_pipe2 293
652__SYSCALL(__NR_pipe2, sys_pipe2)
653#define __NR_inotify_init1 294
654__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
655#define __NR_preadv 295
656__SYSCALL(__NR_preadv, sys_preadv)
657#define __NR_pwritev 296
658__SYSCALL(__NR_pwritev, sys_pwritev)
659#define __NR_rt_tgsigqueueinfo 297
660__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
661#define __NR_perf_event_open 298
662__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
663#define __NR_recvmmsg 299
664__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
665#define __NR_fanotify_init 300
666__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
667#define __NR_fanotify_mark 301
668__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
669#define __NR_prlimit64 302
670__SYSCALL(__NR_prlimit64, sys_prlimit64)
671#define __NR_name_to_handle_at 303
672__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
673#define __NR_open_by_handle_at 304
674__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
675#define __NR_clock_adjtime 305
676__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
677#define __NR_syncfs 306
678__SYSCALL(__NR_syncfs, sys_syncfs)
679#define __NR_sendmmsg 307
680__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
681#define __NR_setns 308
682__SYSCALL(__NR_setns, sys_setns)
683#define __NR_getcpu 309
684__SYSCALL(__NR_getcpu, sys_getcpu)
685#define __NR_process_vm_readv 310
686__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
687#define __NR_process_vm_writev 311
688__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
689
690#ifndef __NO_STUBS
691#define __ARCH_WANT_OLD_READDIR
692#define __ARCH_WANT_OLD_STAT
693#define __ARCH_WANT_SYS_ALARM
694#define __ARCH_WANT_SYS_GETHOSTNAME
695#define __ARCH_WANT_SYS_PAUSE
696#define __ARCH_WANT_SYS_SGETMASK
697#define __ARCH_WANT_SYS_SIGNAL
698#define __ARCH_WANT_SYS_UTIME
699#define __ARCH_WANT_SYS_WAITPID
700#define __ARCH_WANT_SYS_SOCKETCALL
701#define __ARCH_WANT_SYS_FADVISE64
702#define __ARCH_WANT_SYS_GETPGRP
703#define __ARCH_WANT_SYS_LLSEEK
704#define __ARCH_WANT_SYS_NICE
705#define __ARCH_WANT_SYS_OLD_GETRLIMIT
706#define __ARCH_WANT_SYS_OLD_UNAME
707#define __ARCH_WANT_SYS_OLDUMOUNT
708#define __ARCH_WANT_SYS_SIGPENDING
709#define __ARCH_WANT_SYS_SIGPROCMASK
710#define __ARCH_WANT_SYS_RT_SIGACTION
711#define __ARCH_WANT_SYS_RT_SIGSUSPEND
712#define __ARCH_WANT_SYS_TIME
713#define __ARCH_WANT_COMPAT_SYS_TIME
714#endif /* __NO_STUBS */
715
716#ifdef __KERNEL__
717
718#ifndef COMPILE_OFFSETS
719#include <asm/asm-offsets.h>
720#define NR_syscalls (__NR_syscall_max + 1)
721#endif
722
723/*
724 * "Conditional" syscalls
725 *
726 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
727 * but it doesn't work on all toolchains, so we just do it by hand
728 */
729#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
730#endif /* __KERNEL__ */
731
732#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1ac860a09849..517d4767ffdd 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -179,6 +179,7 @@ struct x86_msi_ops {
179 int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); 179 int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
180 void (*teardown_msi_irq)(unsigned int irq); 180 void (*teardown_msi_irq)(unsigned int irq);
181 void (*teardown_msi_irqs)(struct pci_dev *dev); 181 void (*teardown_msi_irqs)(struct pci_dev *dev);
182 void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
182}; 183};
183 184
184extern struct x86_init_ops x86_init; 185extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..5369059c07a9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
25obj-y += probe_roms.o 25obj-y += probe_roms.o
26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
28obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 28obj-y += syscall_$(BITS).o
29obj-$(CONFIG_X86_64) += vsyscall_64.o
29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 30obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
30obj-y += bootflag.o e820.o 31obj-y += bootflag.o e820.o
31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 32obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
80obj-$(CONFIG_AMD_NB) += amd_nb.o 81obj-$(CONFIG_AMD_NB) += amd_nb.o
81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 82obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 83obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
84obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 85
84obj-$(CONFIG_KVM_GUEST) += kvm.o 86obj-$(CONFIG_KVM_GUEST) += kvm.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 87obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 013c1810ce72..be16854591cc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
119 return false; 119 return false;
120} 120}
121 121
122struct resource *amd_get_mmconfig_range(struct resource *res)
123{
124 u32 address;
125 u64 base, msr;
126 unsigned segn_busn_bits;
127
128 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
129 return NULL;
130
131 /* assume all cpus from fam10h have mmconfig */
132 if (boot_cpu_data.x86 < 0x10)
133 return NULL;
134
135 address = MSR_FAM10H_MMIO_CONF_BASE;
136 rdmsrl(address, msr);
137
138 /* mmconfig is not enabled */
139 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
140 return NULL;
141
142 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
143
144 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
145 FAM10H_MMIO_CONF_BUSRANGE_MASK;
146
147 res->flags = IORESOURCE_MEM;
148 res->start = base;
149 res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
150 return res;
151}
152
122int amd_get_subcaches(int cpu) 153int amd_get_subcaches(int cpu)
123{ 154{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 155 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
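In amd_get_mmconfig_range() above, each PCI bus occupies 1 MiB of MMCONFIG space (4 KiB of config space x 8 functions x 32 devices), which is why the window size is computed as 1ULL << (segn_busn_bits + 20). A standalone sketch of that arithmetic (hypothetical field values, not a real MSR read):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical decoded MSR fields. */
	uint64_t base = 0xe0000000ULL;	/* example MMCONFIG base */
	unsigned segn_busn_bits = 8;	/* 2^8 = 256 buses */

	/* Each bus maps 1 MiB (20 bits) of config space, so the
	 * window spans 2^(bits + 20) bytes starting at base. */
	uint64_t size = 1ULL << (segn_busn_bits + 20);
	uint64_t end  = base + size - 1;

	printf("MMCONFIG window: %#llx-%#llx (%llu MiB)\n",
	       (unsigned long long)base, (unsigned long long)end,
	       (unsigned long long)(size >> 20));
	return 0;
}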
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
383static int ignore_normal_resume; 383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 385
386static int debug __read_mostly; 386static bool debug __read_mostly;
387static int smp __read_mostly; 387static bool smp __read_mostly;
388static int apm_disabled = -1; 388static int apm_disabled = -1;
389#ifdef CONFIG_SMP 389#ifdef CONFIG_SMP
390static int power_off; 390static bool power_off;
391#else 391#else
392static int power_off = 1; 392static bool power_off = 1;
393#endif 393#endif
394static int realmode_power_off; 394static bool realmode_power_off;
395#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
396static int allow_ints = 1; 396static bool allow_ints = 1;
397#else 397#else
398static int allow_ints; 398static bool allow_ints;
399#endif 399#endif
400static int broken_psr; 400static bool broken_psr;
401 401
402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version); 68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
71 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
70} 72}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e68067..85d98ab15cdc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
3#include <linux/lguest.h> 3#include <linux/lguest.h>
4#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
5 5
6#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
7static char syscalls[] = {
8#include <asm/syscalls_32.h>
9};
10
6/* workaround for a warning with -Wmissing-prototypes */ 11/* workaround for a warning with -Wmissing-prototypes */
7void foo(void); 12void foo(void);
8 13
@@ -76,4 +81,7 @@ void foo(void)
76 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 81 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
77 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 82 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
78#endif 83#endif
84 BLANK();
85 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
86 DEFINE(NR_syscalls, sizeof(syscalls));
79} 87}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af22..834e897b1e25 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
1#include <asm/ia32.h> 1#include <asm/ia32.h>
2 2
3#define __NO_STUBS 1 3#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
4#undef __SYSCALL 4static char syscalls_64[] = {
5#undef _ASM_X86_UNISTD_64_H 5#include <asm/syscalls_64.h>
6#define __SYSCALL(nr, sym) [nr] = 1, 6};
7static char syscalls[] = { 7#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
8#include <asm/unistd.h> 8static char syscalls_ia32[] = {
9#include <asm/syscalls_32.h>
9}; 10};
10 11
11int main(void) 12int main(void)
@@ -72,7 +73,11 @@ int main(void)
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 73 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
73 BLANK(); 74 BLANK();
74 75
75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); 76 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
77 DEFINE(NR_syscalls, sizeof(syscalls_64));
78
79 DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
80 DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
76 81
77 return 0; 82 return 0;
78} 83}
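The asm-offsets changes above (and the matching asm-offsets_32.c hunk) rest on a C trick: expanding every syscall-table entry to the designated initializer "[nr] = 1," makes the array exactly as long as the highest syscall number plus one, so sizeof() yields the count at compile time. A small sketch of the idea with made-up numbers:

#include <stdio.h>

/* Pretend these entries came from an #included syscall table. */
#define SYSCALL_ENTRY(nr) [nr] = 1,

static char syscalls[] = {
	SYSCALL_ENTRY(0)
	SYSCALL_ENTRY(3)
	SYSCALL_ENTRY(311)	/* the largest index sets the size */
};

int main(void)
{
	/* sizeof(syscalls) == 312: highest index plus one. */
	printf("__NR_syscall_max = %zu\n", sizeof(syscalls) - 1);
	printf("NR_syscalls      = %zu\n", sizeof(syscalls));
	return 0;
}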
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 850f2963a420..d43cad74f166 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
1021 1021
1022#ifdef CONFIG_X86_64 1022#ifdef CONFIG_X86_64
1023struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; 1023struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
1024struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
1025 (unsigned long) nmi_idt_table };
1024 1026
1025DEFINE_PER_CPU_FIRST(union irq_stack_union, 1027DEFINE_PER_CPU_FIRST(union irq_stack_union,
1026 irq_stack_union) __aligned(PAGE_SIZE); 1028 irq_stack_union) __aligned(PAGE_SIZE);
@@ -1085,6 +1087,26 @@ unsigned long kernel_eflags;
1085 */ 1087 */
1086DEFINE_PER_CPU(struct orig_ist, orig_ist); 1088DEFINE_PER_CPU(struct orig_ist, orig_ist);
1087 1089
1090static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
1091DEFINE_PER_CPU(int, debug_stack_usage);
1092
1093int is_debug_stack(unsigned long addr)
1094{
1095 return __get_cpu_var(debug_stack_usage) ||
1096 (addr <= __get_cpu_var(debug_stack_addr) &&
1097 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
1098}
1099
1100void debug_stack_set_zero(void)
1101{
1102 load_idt((const struct desc_ptr *)&nmi_idt_descr);
1103}
1104
1105void debug_stack_reset(void)
1106{
1107 load_idt((const struct desc_ptr *)&idt_descr);
1108}
1109
1088#else /* CONFIG_X86_64 */ 1110#else /* CONFIG_X86_64 */
1089 1111
1090DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1112DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void)
1212 estacks += exception_stack_sizes[v]; 1234 estacks += exception_stack_sizes[v];
1213 oist->ist[v] = t->x86_tss.ist[v] = 1235 oist->ist[v] = t->x86_tss.ist[v] =
1214 (unsigned long)estacks; 1236 (unsigned long)estacks;
1237 if (v == DEBUG_STACK-1)
1238 per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1215 } 1239 }
1216 } 1240 }
1217 1241
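The range half of is_debug_stack() above is a simple membership test: with a downward-growing stack, an address is inside the debug stack if it lies in (top - DEBUG_STKSZ, top]. A standalone sketch of just that check (sizes and addresses are assumed for illustration; the kernel also consults the per-CPU debug_stack_usage counter):

#include <stdio.h>

#define DEBUG_STKSZ 8192UL	/* assumed size, illustration only */

static int in_debug_stack(unsigned long addr, unsigned long top)
{
	/* Half-open interval: strictly above the bottom,
	 * at or below the top. */
	return addr <= top && addr > top - DEBUG_STKSZ;
}

int main(void)
{
	unsigned long top = 0xffff880000010000UL;	/* made-up top */

	printf("%d\n", in_debug_stack(top - 100, top));		/* 1 */
	printf("%d\n", in_debug_stack(top - DEBUG_STKSZ, top));	/* 0 */
	return 0;
}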
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
844 844
845#include <linux/kobject.h> 845#include <linux/kobject.h>
846#include <linux/sysfs.h> 846#include <linux/sysfs.h>
847 847#include <linux/cpu.h>
848extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
849 848
850/* pointer to kobject for cpuX/cache */ 849/* pointer to kobject for cpuX/cache */
851static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); 850static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
1073static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 1072static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
1074 1073
1075/* Add/Remove cache interface for CPU device */ 1074/* Add/Remove cache interface for CPU device */
1076static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 1075static int __cpuinit cache_add_dev(struct device *dev)
1077{ 1076{
1078 unsigned int cpu = sys_dev->id; 1077 unsigned int cpu = dev->id;
1079 unsigned long i, j; 1078 unsigned long i, j;
1080 struct _index_kobject *this_object; 1079 struct _index_kobject *this_object;
1081 struct _cpuid4_info *this_leaf; 1080 struct _cpuid4_info *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1087 1086
1088 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), 1087 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1089 &ktype_percpu_entry, 1088 &ktype_percpu_entry,
1090 &sys_dev->kobj, "%s", "cache"); 1089 &dev->kobj, "%s", "cache");
1091 if (retval < 0) { 1090 if (retval < 0) {
1092 cpuid4_cache_sysfs_exit(cpu); 1091 cpuid4_cache_sysfs_exit(cpu);
1093 return retval; 1092 return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1124 return 0; 1123 return 0;
1125} 1124}
1126 1125
1127static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 1126static void __cpuinit cache_remove_dev(struct device *dev)
1128{ 1127{
1129 unsigned int cpu = sys_dev->id; 1128 unsigned int cpu = dev->id;
1130 unsigned long i; 1129 unsigned long i;
1131 1130
1132 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 1131 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
1145 unsigned long action, void *hcpu) 1144 unsigned long action, void *hcpu)
1146{ 1145{
1147 unsigned int cpu = (unsigned long)hcpu; 1146 unsigned int cpu = (unsigned long)hcpu;
1148 struct sys_device *sys_dev; 1147 struct device *dev;
1149 1148
1150 sys_dev = get_cpu_sysdev(cpu); 1149 dev = get_cpu_device(cpu);
1151 switch (action) { 1150 switch (action) {
1152 case CPU_ONLINE: 1151 case CPU_ONLINE:
1153 case CPU_ONLINE_FROZEN: 1152 case CPU_ONLINE_FROZEN:
1154 cache_add_dev(sys_dev); 1153 cache_add_dev(dev);
1155 break; 1154 break;
1156 case CPU_DEAD: 1155 case CPU_DEAD:
1157 case CPU_DEAD_FROZEN: 1156 case CPU_DEAD_FROZEN:
1158 cache_remove_dev(sys_dev); 1157 cache_remove_dev(dev);
1159 break; 1158 break;
1160 } 1159 }
1161 return NOTIFY_OK; 1160 return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
1174 1173
1175 for_each_online_cpu(i) { 1174 for_each_online_cpu(i) {
1176 int err; 1175 int err;
1177 struct sys_device *sys_dev = get_cpu_sysdev(i); 1176 struct device *dev = get_cpu_device(i);
1178 1177
1179 err = cache_add_dev(sys_dev); 1178 err = cache_add_dev(dev);
1180 if (err) 1179 if (err)
1181 return err; 1180 return err;
1182 } 1181 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
1#include <linux/sysdev.h> 1#include <linux/device.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4enum severity_level { 4enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
17struct mce_bank { 17struct mce_bank {
18 u64 ctl; /* subevents to enable */ 18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */ 19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */ 20 struct device_attribute attr; /* device attribute */
21 char attrname[ATTR_LEN]; /* attribute name */ 21 char attrname[ATTR_LEN]; /* attribute name */
22}; 22};
23 23
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index cbe82b5918ce..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/sysdev.h> 22#include <linux/device.h>
23#include <linux/syscore_ops.h> 23#include <linux/syscore_ops.h>
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/ctype.h> 25#include <linux/ctype.h>
@@ -1818,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
1818}; 1818};
1819 1819
1820/* 1820/*
1821 * mce_sysdev: Sysfs support 1821 * mce_device: Sysfs support
1822 */ 1822 */
1823 1823
1824static void mce_cpu_restart(void *data) 1824static void mce_cpu_restart(void *data)
@@ -1854,27 +1854,28 @@ static void mce_enable_ce(void *all)
1854 __mcheck_cpu_init_timer(); 1854 __mcheck_cpu_init_timer();
1855} 1855}
1856 1856
1857static struct sysdev_class mce_sysdev_class = { 1857static struct bus_type mce_subsys = {
1858 .name = "machinecheck", 1858 .name = "machinecheck",
1859 .dev_name = "machinecheck",
1859}; 1860};
1860 1861
1861DEFINE_PER_CPU(struct sys_device, mce_sysdev); 1862struct device *mce_device[CONFIG_NR_CPUS];
1862 1863
1863__cpuinitdata 1864__cpuinitdata
1864void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1865void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1865 1866
1866static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) 1867static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
1867{ 1868{
1868 return container_of(attr, struct mce_bank, attr); 1869 return container_of(attr, struct mce_bank, attr);
1869} 1870}
1870 1871
1871static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1872static ssize_t show_bank(struct device *s, struct device_attribute *attr,
1872 char *buf) 1873 char *buf)
1873{ 1874{
1874 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1875 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1875} 1876}
1876 1877
1877static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1878static ssize_t set_bank(struct device *s, struct device_attribute *attr,
1878 const char *buf, size_t size) 1879 const char *buf, size_t size)
1879{ 1880{
1880 u64 new; 1881 u64 new;
@@ -1889,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1889} 1890}
1890 1891
1891static ssize_t 1892static ssize_t
1892show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1893show_trigger(struct device *s, struct device_attribute *attr, char *buf)
1893{ 1894{
1894 strcpy(buf, mce_helper); 1895 strcpy(buf, mce_helper);
1895 strcat(buf, "\n"); 1896 strcat(buf, "\n");
1896 return strlen(mce_helper) + 1; 1897 return strlen(mce_helper) + 1;
1897} 1898}
1898 1899
1899static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1900static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
1900 const char *buf, size_t siz) 1901 const char *buf, size_t siz)
1901{ 1902{
1902 char *p; 1903 char *p;
@@ -1911,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1911 return strlen(mce_helper) + !!p; 1912 return strlen(mce_helper) + !!p;
1912} 1913}
1913 1914
1914static ssize_t set_ignore_ce(struct sys_device *s, 1915static ssize_t set_ignore_ce(struct device *s,
1915 struct sysdev_attribute *attr, 1916 struct device_attribute *attr,
1916 const char *buf, size_t size) 1917 const char *buf, size_t size)
1917{ 1918{
1918 u64 new; 1919 u64 new;
@@ -1935,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
1935 return size; 1936 return size;
1936} 1937}
1937 1938
1938static ssize_t set_cmci_disabled(struct sys_device *s, 1939static ssize_t set_cmci_disabled(struct device *s,
1939 struct sysdev_attribute *attr, 1940 struct device_attribute *attr,
1940 const char *buf, size_t size) 1941 const char *buf, size_t size)
1941{ 1942{
1942 u64 new; 1943 u64 new;
@@ -1958,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
1958 return size; 1959 return size;
1959} 1960}
1960 1961
1961static ssize_t store_int_with_restart(struct sys_device *s, 1962static ssize_t store_int_with_restart(struct device *s,
1962 struct sysdev_attribute *attr, 1963 struct device_attribute *attr,
1963 const char *buf, size_t size) 1964 const char *buf, size_t size)
1964{ 1965{
1965 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1966 ssize_t ret = device_store_int(s, attr, buf, size);
1966 mce_restart(); 1967 mce_restart();
1967 return ret; 1968 return ret;
1968} 1969}
1969 1970
1970static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1971static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
1971static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1972static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
1972static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1973static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1973static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1974static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1974 1975
1975static struct sysdev_ext_attribute attr_check_interval = { 1976static struct dev_ext_attribute dev_attr_check_interval = {
1976 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1977 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
1977 store_int_with_restart),
1978 &check_interval 1978 &check_interval
1979}; 1979};
1980 1980
1981static struct sysdev_ext_attribute attr_ignore_ce = { 1981static struct dev_ext_attribute dev_attr_ignore_ce = {
1982 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1982 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
1983 &mce_ignore_ce 1983 &mce_ignore_ce
1984}; 1984};
1985 1985
1986static struct sysdev_ext_attribute attr_cmci_disabled = { 1986static struct dev_ext_attribute dev_attr_cmci_disabled = {
1987 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), 1987 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
1988 &mce_cmci_disabled 1988 &mce_cmci_disabled
1989}; 1989};
1990 1990
1991static struct sysdev_attribute *mce_sysdev_attrs[] = { 1991static struct device_attribute *mce_device_attrs[] = {
1992 &attr_tolerant.attr, 1992 &dev_attr_tolerant.attr,
1993 &attr_check_interval.attr, 1993 &dev_attr_check_interval.attr,
1994 &attr_trigger, 1994 &dev_attr_trigger,
1995 &attr_monarch_timeout.attr, 1995 &dev_attr_monarch_timeout.attr,
1996 &attr_dont_log_ce.attr, 1996 &dev_attr_dont_log_ce.attr,
1997 &attr_ignore_ce.attr, 1997 &dev_attr_ignore_ce.attr,
1998 &attr_cmci_disabled.attr, 1998 &dev_attr_cmci_disabled.attr,
1999 NULL 1999 NULL
2000}; 2000};
2001 2001
2002static cpumask_var_t mce_sysdev_initialized; 2002static cpumask_var_t mce_device_initialized;
2003 2003
2004/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 2004static void mce_device_release(struct device *dev)
2005static __cpuinit int mce_sysdev_create(unsigned int cpu)
2006{ 2005{
2007 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2006 kfree(dev);
2007}
2008
2009/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2010static __cpuinit int mce_device_create(unsigned int cpu)
2011{
2012 struct device *dev;
2008 int err; 2013 int err;
2009 int i, j; 2014 int i, j;
2010 2015
2011 if (!mce_available(&boot_cpu_data)) 2016 if (!mce_available(&boot_cpu_data))
2012 return -EIO; 2017 return -EIO;
2013 2018
2014 memset(&sysdev->kobj, 0, sizeof(struct kobject)); 2019 dev = kzalloc(sizeof *dev, GFP_KERNEL);
2015 sysdev->id = cpu; 2020 if (!dev)
2016 sysdev->cls = &mce_sysdev_class; 2021 return -ENOMEM;
2022 dev->id = cpu;
2023 dev->bus = &mce_subsys;
2024 dev->release = &mce_device_release;
2017 2025
2018 err = sysdev_register(sysdev); 2026 err = device_register(dev);
2019 if (err) 2027 if (err)
2020 return err; 2028 return err;
2021 2029
2022 for (i = 0; mce_sysdev_attrs[i]; i++) { 2030 for (i = 0; mce_device_attrs[i]; i++) {
2023 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); 2031 err = device_create_file(dev, mce_device_attrs[i]);
2024 if (err) 2032 if (err)
2025 goto error; 2033 goto error;
2026 } 2034 }
2027 for (j = 0; j < banks; j++) { 2035 for (j = 0; j < banks; j++) {
2028 err = sysdev_create_file(sysdev, &mce_banks[j].attr); 2036 err = device_create_file(dev, &mce_banks[j].attr);
2029 if (err) 2037 if (err)
2030 goto error2; 2038 goto error2;
2031 } 2039 }
2032 cpumask_set_cpu(cpu, mce_sysdev_initialized); 2040 cpumask_set_cpu(cpu, mce_device_initialized);
2041 mce_device[cpu] = dev;
2033 2042
2034 return 0; 2043 return 0;
2035error2: 2044error2:
2036 while (--j >= 0) 2045 while (--j >= 0)
2037 sysdev_remove_file(sysdev, &mce_banks[j].attr); 2046 device_remove_file(dev, &mce_banks[j].attr);
2038error: 2047error:
2039 while (--i >= 0) 2048 while (--i >= 0)
2040 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2049 device_remove_file(dev, mce_device_attrs[i]);
2041 2050
2042 sysdev_unregister(sysdev); 2051 device_unregister(dev);
2043 2052
2044 return err; 2053 return err;
2045} 2054}
2046 2055
2047static __cpuinit void mce_sysdev_remove(unsigned int cpu) 2056static __cpuinit void mce_device_remove(unsigned int cpu)
2048{ 2057{
2049 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2058 struct device *dev = mce_device[cpu];
2050 int i; 2059 int i;
2051 2060
2052 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) 2061 if (!cpumask_test_cpu(cpu, mce_device_initialized))
2053 return; 2062 return;
2054 2063
2055 for (i = 0; mce_sysdev_attrs[i]; i++) 2064 for (i = 0; mce_device_attrs[i]; i++)
2056 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2065 device_remove_file(dev, mce_device_attrs[i]);
2057 2066
2058 for (i = 0; i < banks; i++) 2067 for (i = 0; i < banks; i++)
2059 sysdev_remove_file(sysdev, &mce_banks[i].attr); 2068 device_remove_file(dev, &mce_banks[i].attr);
2060 2069
2061 sysdev_unregister(sysdev); 2070 device_unregister(dev);
2062 cpumask_clear_cpu(cpu, mce_sysdev_initialized); 2071 cpumask_clear_cpu(cpu, mce_device_initialized);
2072 mce_device[cpu] = NULL;
2063} 2073}
2064 2074
2065/* Make sure there are no machine checks on offlined CPUs. */ 2075/* Make sure there are no machine checks on offlined CPUs. */
@@ -2109,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2109 switch (action) { 2119 switch (action) {
2110 case CPU_ONLINE: 2120 case CPU_ONLINE:
2111 case CPU_ONLINE_FROZEN: 2121 case CPU_ONLINE_FROZEN:
2112 mce_sysdev_create(cpu); 2122 mce_device_create(cpu);
2113 if (threshold_cpu_callback) 2123 if (threshold_cpu_callback)
2114 threshold_cpu_callback(action, cpu); 2124 threshold_cpu_callback(action, cpu);
2115 break; 2125 break;
@@ -2117,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2117 case CPU_DEAD_FROZEN: 2127 case CPU_DEAD_FROZEN:
2118 if (threshold_cpu_callback) 2128 if (threshold_cpu_callback)
2119 threshold_cpu_callback(action, cpu); 2129 threshold_cpu_callback(action, cpu);
2120 mce_sysdev_remove(cpu); 2130 mce_device_remove(cpu);
2121 break; 2131 break;
2122 case CPU_DOWN_PREPARE: 2132 case CPU_DOWN_PREPARE:
2123 case CPU_DOWN_PREPARE_FROZEN: 2133 case CPU_DOWN_PREPARE_FROZEN:
@@ -2151,7 +2161,7 @@ static __init void mce_init_banks(void)
2151 2161
2152 for (i = 0; i < banks; i++) { 2162 for (i = 0; i < banks; i++) {
2153 struct mce_bank *b = &mce_banks[i]; 2163 struct mce_bank *b = &mce_banks[i];
2154 struct sysdev_attribute *a = &b->attr; 2164 struct device_attribute *a = &b->attr;
2155 2165
2156 sysfs_attr_init(&a->attr); 2166 sysfs_attr_init(&a->attr);
2157 a->attr.name = b->attrname; 2167 a->attr.name = b->attrname;
@@ -2171,16 +2181,16 @@ static __init int mcheck_init_device(void)
2171 if (!mce_available(&boot_cpu_data)) 2181 if (!mce_available(&boot_cpu_data))
2172 return -EIO; 2182 return -EIO;
2173 2183
2174 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); 2184 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
2175 2185
2176 mce_init_banks(); 2186 mce_init_banks();
2177 2187
2178 err = sysdev_class_register(&mce_sysdev_class); 2188 err = subsys_system_register(&mce_subsys, NULL);
2179 if (err) 2189 if (err)
2180 return err; 2190 return err;
2181 2191
2182 for_each_online_cpu(i) { 2192 for_each_online_cpu(i) {
2183 err = mce_sysdev_create(i); 2193 err = mce_device_create(i);
2184 if (err) 2194 if (err)
2185 return err; 2195 return err;
2186 } 2196 }
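The sysdev-to-device conversion above leans on struct dev_ext_attribute, which pairs a device_attribute with a pointer to the variable it exposes, so one generic show/store helper (device_show_int() and friends) can serve many integer knobs. A simplified userspace sketch of the pattern (the struct and helper here are stand-ins, not the real <linux/device.h> types):

#include <stdio.h>

struct attr {
	const char *name;
	int (*show)(struct attr *a, char *buf);
};

struct ext_attr {
	struct attr attr;	/* must be the first member */
	int *var;		/* the variable this attribute exposes */
};

/* One generic helper, like device_show_int(): recover the
 * containing ext_attr (the kernel uses container_of()) and
 * print the pointed-to value. */
static int show_int(struct attr *a, char *buf)
{
	struct ext_attr *ea = (struct ext_attr *)a;
	return sprintf(buf, "%d\n", *ea->var);
}

static int check_interval = 300;

static struct ext_attr attr_check_interval = {
	{ "check_interval", show_int },
	&check_interval,
};

int main(void)
{
	char buf[32];
	attr_check_interval.attr.show(&attr_check_interval.attr, buf);
	fputs(buf, stdout);
	return 0;
}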
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1d76872b6a45..786e76a86322 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/kobject.h> 18#include <linux/kobject.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/sysdev.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23#include <linux/sysfs.h> 22#include <linux/sysfs.h>
@@ -524,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
524{ 523{
525 int i, err = 0; 524 int i, err = 0;
526 struct threshold_bank *b = NULL; 525 struct threshold_bank *b = NULL;
526 struct device *dev = mce_device[cpu];
527 char name[32]; 527 char name[32];
528 528
529 sprintf(name, "threshold_bank%i", bank); 529 sprintf(name, "threshold_bank%i", bank);
@@ -544,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
544 if (!b) 544 if (!b)
545 goto out; 545 goto out;
546 546
547 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, 547 err = sysfs_create_link(&dev->kobj, b->kobj, name);
548 b->kobj, name);
549 if (err) 548 if (err)
550 goto out; 549 goto out;
551 550
@@ -566,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
566 goto out; 565 goto out;
567 } 566 }
568 567
569 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); 568 b->kobj = kobject_create_and_add(name, &dev->kobj);
570 if (!b->kobj) 569 if (!b->kobj)
571 goto out_free; 570 goto out_free;
572 571
@@ -586,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
586 if (i == cpu) 585 if (i == cpu)
587 continue; 586 continue;
588 587
589 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, 588 dev = mce_device[i];
590 b->kobj, name); 589 if (dev)
590 err = sysfs_create_link(&dev->kobj, b->kobj, name);
591 if (err) 591 if (err)
592 goto out; 592 goto out;
593 593
@@ -650,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
650static void threshold_remove_bank(unsigned int cpu, int bank) 650static void threshold_remove_bank(unsigned int cpu, int bank)
651{ 651{
652 struct threshold_bank *b; 652 struct threshold_bank *b;
653 struct device *dev;
653 char name[32]; 654 char name[32];
654 int i = 0; 655 int i = 0;
655 656
@@ -664,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
664#ifdef CONFIG_SMP 665#ifdef CONFIG_SMP
665 /* sibling symlink */ 666 /* sibling symlink */
666 if (shared_bank[bank] && b->blocks->cpu != cpu) { 667 if (shared_bank[bank] && b->blocks->cpu != cpu) {
667 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); 668 sysfs_remove_link(&mce_device[cpu]->kobj, name);
668 per_cpu(threshold_banks, cpu)[bank] = NULL; 669 per_cpu(threshold_banks, cpu)[bank] = NULL;
669 670
670 return; 671 return;
@@ -676,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
676 if (i == cpu) 677 if (i == cpu)
677 continue; 678 continue;
678 679
679 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); 680 dev = mce_device[i];
681 if (dev)
682 sysfs_remove_link(&dev->kobj, name);
680 per_cpu(threshold_banks, i)[bank] = NULL; 683 per_cpu(threshold_banks, i)[bank] = NULL;
681 } 684 }
682 685
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 39c6089891e4..67bb17a37a0a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/sysdev.h>
23#include <linux/types.h> 22#include <linux/types.h>
24#include <linux/init.h> 23#include <linux/init.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
69static u32 lvtthmr_init __read_mostly; 68static u32 lvtthmr_init __read_mostly;
70 69
71#ifdef CONFIG_SYSFS 70#ifdef CONFIG_SYSFS
72#define define_therm_throt_sysdev_one_ro(_name) \ 71#define define_therm_throt_device_one_ro(_name) \
73 static SYSDEV_ATTR(_name, 0444, \ 72 static DEVICE_ATTR(_name, 0444, \
74 therm_throt_sysdev_show_##_name, \ 73 therm_throt_device_show_##_name, \
75 NULL) \ 74 NULL) \
76 75
77#define define_therm_throt_sysdev_show_func(event, name) \ 76#define define_therm_throt_device_show_func(event, name) \
78 \ 77 \
79static ssize_t therm_throt_sysdev_show_##event##_##name( \ 78static ssize_t therm_throt_device_show_##event##_##name( \
80 struct sys_device *dev, \ 79 struct device *dev, \
81 struct sysdev_attribute *attr, \ 80 struct device_attribute *attr, \
82 char *buf) \ 81 char *buf) \
83{ \ 82{ \
84 unsigned int cpu = dev->id; \ 83 unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
95 return ret; \ 94 return ret; \
96} 95}
97 96
98define_therm_throt_sysdev_show_func(core_throttle, count); 97define_therm_throt_device_show_func(core_throttle, count);
99define_therm_throt_sysdev_one_ro(core_throttle_count); 98define_therm_throt_device_one_ro(core_throttle_count);
100 99
101define_therm_throt_sysdev_show_func(core_power_limit, count); 100define_therm_throt_device_show_func(core_power_limit, count);
102define_therm_throt_sysdev_one_ro(core_power_limit_count); 101define_therm_throt_device_one_ro(core_power_limit_count);
103 102
104define_therm_throt_sysdev_show_func(package_throttle, count); 103define_therm_throt_device_show_func(package_throttle, count);
105define_therm_throt_sysdev_one_ro(package_throttle_count); 104define_therm_throt_device_one_ro(package_throttle_count);
106 105
107define_therm_throt_sysdev_show_func(package_power_limit, count); 106define_therm_throt_device_show_func(package_power_limit, count);
108define_therm_throt_sysdev_one_ro(package_power_limit_count); 107define_therm_throt_device_one_ro(package_power_limit_count);
109 108
110static struct attribute *thermal_throttle_attrs[] = { 109static struct attribute *thermal_throttle_attrs[] = {
111 &attr_core_throttle_count.attr, 110 &dev_attr_core_throttle_count.attr,
112 NULL 111 NULL
113}; 112};
114 113
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
223 222
224#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
225/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
226static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct device *dev,
227 unsigned int cpu) 226 unsigned int cpu)
228{ 227{
229 int err; 228 int err;
230 struct cpuinfo_x86 *c = &cpu_data(cpu); 229 struct cpuinfo_x86 *c = &cpu_data(cpu);
231 230
232 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); 231 err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
233 if (err) 232 if (err)
234 return err; 233 return err;
235 234
236 if (cpu_has(c, X86_FEATURE_PLN)) 235 if (cpu_has(c, X86_FEATURE_PLN))
237 err = sysfs_add_file_to_group(&sys_dev->kobj, 236 err = sysfs_add_file_to_group(&dev->kobj,
238 &attr_core_power_limit_count.attr, 237 &dev_attr_core_power_limit_count.attr,
239 thermal_attr_group.name); 238 thermal_attr_group.name);
240 if (cpu_has(c, X86_FEATURE_PTS)) { 239 if (cpu_has(c, X86_FEATURE_PTS)) {
241 err = sysfs_add_file_to_group(&sys_dev->kobj, 240 err = sysfs_add_file_to_group(&dev->kobj,
242 &attr_package_throttle_count.attr, 241 &dev_attr_package_throttle_count.attr,
243 thermal_attr_group.name); 242 thermal_attr_group.name);
244 if (cpu_has(c, X86_FEATURE_PLN)) 243 if (cpu_has(c, X86_FEATURE_PLN))
245 err = sysfs_add_file_to_group(&sys_dev->kobj, 244 err = sysfs_add_file_to_group(&dev->kobj,
246 &attr_package_power_limit_count.attr, 245 &dev_attr_package_power_limit_count.attr,
247 thermal_attr_group.name); 246 thermal_attr_group.name);
248 } 247 }
249 248
250 return err; 249 return err;
251} 250}
252 251
253static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 252static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
254{ 253{
255 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); 254 sysfs_remove_group(&dev->kobj, &thermal_attr_group);
256} 255}
257 256
258/* Mutex protecting device creation against CPU hotplug: */ 257/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
265 void *hcpu) 264 void *hcpu)
266{ 265{
267 unsigned int cpu = (unsigned long)hcpu; 266 unsigned int cpu = (unsigned long)hcpu;
268 struct sys_device *sys_dev; 267 struct device *dev;
269 int err = 0; 268 int err = 0;
270 269
271 sys_dev = get_cpu_sysdev(cpu); 270 dev = get_cpu_device(cpu);
272 271
273 switch (action) { 272 switch (action) {
274 case CPU_UP_PREPARE: 273 case CPU_UP_PREPARE:
275 case CPU_UP_PREPARE_FROZEN: 274 case CPU_UP_PREPARE_FROZEN:
276 mutex_lock(&therm_cpu_lock); 275 mutex_lock(&therm_cpu_lock);
277 err = thermal_throttle_add_dev(sys_dev, cpu); 276 err = thermal_throttle_add_dev(dev, cpu);
278 mutex_unlock(&therm_cpu_lock); 277 mutex_unlock(&therm_cpu_lock);
279 WARN_ON(err); 278 WARN_ON(err);
280 break; 279 break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
283 case CPU_DEAD: 282 case CPU_DEAD:
284 case CPU_DEAD_FROZEN: 283 case CPU_DEAD_FROZEN:
285 mutex_lock(&therm_cpu_lock); 284 mutex_lock(&therm_cpu_lock);
286 thermal_throttle_remove_dev(sys_dev); 285 thermal_throttle_remove_dev(dev);
287 mutex_unlock(&therm_cpu_lock); 286 mutex_unlock(&therm_cpu_lock);
288 break; 287 break;
289 } 288 }
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
310#endif 309#endif
311 /* connect live CPUs to sysfs */ 310 /* connect live CPUs to sysfs */
312 for_each_online_cpu(cpu) { 311 for_each_online_cpu(cpu) {
313 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); 312 err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
314 WARN_ON(err); 313 WARN_ON(err);
315 } 314 }
316#ifdef CONFIG_HOTPLUG_CPU 315#ifdef CONFIG_HOTPLUG_CPU
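The define_therm_throt_device_show_func() rename above keeps the same token-pasting scheme: the ## operator stamps out one show function and one symbol name per event. A minimal sketch of that preprocessor technique (hypothetical names, outside sysfs):

#include <stdio.h>

/* Token pasting generates one function per event, the same way
 * define_therm_throt_device_show_func() does. */
#define DEFINE_SHOW_FUNC(event)					\
static void show_##event##_count(unsigned long count)		\
{								\
	printf(#event " count: %lu\n", count);			\
}

DEFINE_SHOW_FUNC(core_throttle)
DEFINE_SHOW_FUNC(package_throttle)

int main(void)
{
	show_core_throttle_count(3);
	show_package_throttle_count(7);
	return 0;
}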
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
177 .notifier_call = cpuid_class_cpu_callback, 177 .notifier_call = cpuid_class_cpu_callback,
178}; 178};
179 179
180static char *cpuid_devnode(struct device *dev, mode_t *mode) 180static char *cpuid_devnode(struct device *dev, umode_t *mode)
181{ 181{
182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
183} 183}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8071e2f3d6eb..62d61e9976eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
19#include <linux/acpi.h> 19#include <linux/acpi.h>
20#include <linux/firmware-map.h> 20#include <linux/firmware-map.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22#include <linux/sort.h>
22 23
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/proto.h> 25#include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
227 * ____________________33__ 228 * ____________________33__
228 * ______________________4_ 229 * ______________________4_
229 */ 230 */
231struct change_member {
232 struct e820entry *pbios; /* pointer to original bios entry */
233 unsigned long long addr; /* address for this change point */
234};
235
236static int __init cpcompare(const void *a, const void *b)
237{
238 struct change_member * const *app = a, * const *bpp = b;
239 const struct change_member *ap = *app, *bp = *bpp;
240
241 /*
242 * Inputs are pointers to two elements of change_point[]. If their
243 * addresses are unequal, their difference dominates. If the addresses
244 * are equal, then consider one that represents the end of its region
245 * to be greater than one that does not.
246 */
247 if (ap->addr != bp->addr)
248 return ap->addr > bp->addr ? 1 : -1;
249
250 return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
251}
230 252
231int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 253int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
232 u32 *pnr_map) 254 u32 *pnr_map)
233{ 255{
234 struct change_member {
235 struct e820entry *pbios; /* pointer to original bios entry */
236 unsigned long long addr; /* address for this change point */
237 };
238 static struct change_member change_point_list[2*E820_X_MAX] __initdata; 256 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
239 static struct change_member *change_point[2*E820_X_MAX] __initdata; 257 static struct change_member *change_point[2*E820_X_MAX] __initdata;
240 static struct e820entry *overlap_list[E820_X_MAX] __initdata; 258 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
241 static struct e820entry new_bios[E820_X_MAX] __initdata; 259 static struct e820entry new_bios[E820_X_MAX] __initdata;
242 struct change_member *change_tmp;
243 unsigned long current_type, last_type; 260 unsigned long current_type, last_type;
244 unsigned long long last_addr; 261 unsigned long long last_addr;
245 int chgidx, still_changing; 262 int chgidx;
246 int overlap_entries; 263 int overlap_entries;
247 int new_bios_entry; 264 int new_bios_entry;
248 int old_nr, new_nr, chg_nr; 265 int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
279 chg_nr = chgidx; 296 chg_nr = chgidx;
280 297
281 /* sort change-point list by memory addresses (low -> high) */ 298 /* sort change-point list by memory addresses (low -> high) */
282 still_changing = 1; 299 sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
283 while (still_changing) {
284 still_changing = 0;
285 for (i = 1; i < chg_nr; i++) {
286 unsigned long long curaddr, lastaddr;
287 unsigned long long curpbaddr, lastpbaddr;
288
289 curaddr = change_point[i]->addr;
290 lastaddr = change_point[i - 1]->addr;
291 curpbaddr = change_point[i]->pbios->addr;
292 lastpbaddr = change_point[i - 1]->pbios->addr;
293
294 /*
295 * swap entries, when:
296 *
297 * curaddr > lastaddr or
298 * curaddr == lastaddr and curaddr == curpbaddr and
299 * lastaddr != lastpbaddr
300 */
301 if (curaddr < lastaddr ||
302 (curaddr == lastaddr && curaddr == curpbaddr &&
303 lastaddr != lastpbaddr)) {
304 change_tmp = change_point[i];
305 change_point[i] = change_point[i-1];
306 change_point[i-1] = change_tmp;
307 still_changing = 1;
308 }
309 }
310 }
311 300
312 /* create a new bios memory map, removing overlaps */ 301 /* create a new bios memory map, removing overlaps */
313 overlap_entries = 0; /* number of entries in the overlap table */ 302 overlap_entries = 0; /* number of entries in the overlap table */
@@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
714} 703}
715#endif 704#endif
716 705
717#ifdef CONFIG_HIBERNATION 706#ifdef CONFIG_ACPI
718/** 707/**
719 * Mark ACPI NVS memory region, so that we can save/restore it during 708 * Mark ACPI NVS memory region, so that we can save/restore it during
720 * hibernation and the subsequent resume. 709 * hibernation and the subsequent resume.
@@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void)
727 struct e820entry *ei = &e820.map[i]; 716 struct e820entry *ei = &e820.map[i];
728 717
729 if (ei->type == E820_NVS) 718 if (ei->type == E820_NVS)
730 suspend_nvs_register(ei->addr, ei->size); 719 acpi_nvs_register(ei->addr, ei->size);
731 } 720 }
732 721
733 return 0; 722 return 0;
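The e820 change above swaps an open-coded bubble sort for the library sort(); note that sort() is handed an array of *pointers* to change points, so cpcompare() dereferences twice before comparing. A userspace sketch of the same ordering rule with qsort() (field names and data are hypothetical):

#include <stdio.h>
#include <stdlib.h>

struct change_member {
	unsigned long long region_start;	/* like pbios->addr */
	unsigned long long addr;		/* this change point */
};

/* Sort by address; on ties, a point that ends its region
 * (addr != region_start) sorts after one that starts it. */
static int cpcompare(const void *a, const void *b)
{
	const struct change_member *ap = *(struct change_member * const *)a;
	const struct change_member *bp = *(struct change_member * const *)b;

	if (ap->addr != bp->addr)
		return ap->addr > bp->addr ? 1 : -1;
	return (ap->addr != ap->region_start) -
	       (bp->addr != bp->region_start);
}

int main(void)
{
	struct change_member pts[] = {
		{ 0x1000, 0x2000 },	/* end of region starting at 0x1000 */
		{ 0x2000, 0x2000 },	/* start of region at 0x2000 */
	};
	struct change_member *p[] = { &pts[0], &pts[1] };

	qsort(p, 2, sizeof *p, cpcompare);

	/* The region start at 0x2000 sorts before the region end. */
	printf("first: start=%#llx addr=%#llx\n",
	       p[0]->region_start, p[0]->addr);
	return 0;
}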
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9b9f18b49918 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
240 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
241 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
242#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST 243#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
244 if (!strncmp(buf, "mrst", 4)) { 244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init(); 245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep); 246 early_console_register(&early_mrst_console, keep);
247 } 247 }
248 248
249 if (!strncmp(buf, "hsu", 3)) { 249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init(); 250 hsu_early_console_init(buf + 3);
251 early_console_register(&early_hsu_console, keep); 251 early_console_register(&early_hsu_console, keep);
252 } 252 }
253#endif 253#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21b4dd7..79d97e68f042 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <linux/linkage.h> 44#include <linux/linkage.h>
45#include <linux/err.h>
45#include <asm/thread_info.h> 46#include <asm/thread_info.h>
46#include <asm/irqflags.h> 47#include <asm/irqflags.h>
47#include <asm/errno.h> 48#include <asm/errno.h>
@@ -81,8 +82,6 @@
81 * enough to patch inline, increasing performance. 82 * enough to patch inline, increasing performance.
82 */ 83 */
83 84
84#define nr_syscalls ((syscall_table_size)/4)
85
86#ifdef CONFIG_PREEMPT 85#ifdef CONFIG_PREEMPT
87#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 86#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 87#else
@@ -423,7 +422,7 @@ sysenter_past_esp:
423 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 422 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
424 jnz sysenter_audit 423 jnz sysenter_audit
425sysenter_do_call: 424sysenter_do_call:
426 cmpl $(nr_syscalls), %eax 425 cmpl $(NR_syscalls), %eax
427 jae syscall_badsys 426 jae syscall_badsys
428 call *sys_call_table(,%eax,4) 427 call *sys_call_table(,%eax,4)
429 movl %eax,PT_EAX(%esp) 428 movl %eax,PT_EAX(%esp)
@@ -455,7 +454,7 @@ sysenter_audit:
455 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ 454 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
456 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
457 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
458 call audit_syscall_entry 457 call __audit_syscall_entry
459 pushl_cfi %ebx 458 pushl_cfi %ebx
460 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
461 jmp sysenter_do_call 460 jmp sysenter_do_call
@@ -466,11 +465,10 @@ sysexit_audit:
466 TRACE_IRQS_ON 465 TRACE_IRQS_ON
467 ENABLE_INTERRUPTS(CLBR_ANY) 466 ENABLE_INTERRUPTS(CLBR_ANY)
468 movl %eax,%edx /* second arg, syscall return value */ 467 movl %eax,%edx /* second arg, syscall return value */
469 cmpl $0,%eax /* is it < 0? */ 468 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
470 setl %al /* 1 if so, 0 if not */ 469 setbe %al /* 1 if so, 0 if not */
471 movzbl %al,%eax /* zero-extend that */ 470 movzbl %al,%eax /* zero-extend that */
472 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 471 call __audit_syscall_exit
473 call audit_syscall_exit
474 DISABLE_INTERRUPTS(CLBR_ANY) 472 DISABLE_INTERRUPTS(CLBR_ANY)
475 TRACE_IRQS_OFF 473 TRACE_IRQS_OFF
476 movl TI_flags(%ebp), %ecx 474 movl TI_flags(%ebp), %ecx
@@ -504,7 +502,7 @@ ENTRY(system_call)
504 # system call tracing in operation / emulation 502 # system call tracing in operation / emulation
505 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 503 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
506 jnz syscall_trace_entry 504 jnz syscall_trace_entry
507 cmpl $(nr_syscalls), %eax 505 cmpl $(NR_syscalls), %eax
508 jae syscall_badsys 506 jae syscall_badsys
509syscall_call: 507syscall_call:
510 call *sys_call_table(,%eax,4) 508 call *sys_call_table(,%eax,4)
@@ -654,7 +652,7 @@ syscall_trace_entry:
654 movl %esp, %eax 652 movl %esp, %eax
655 call syscall_trace_enter 653 call syscall_trace_enter
656 /* What it returned is what we'll actually use. */ 654 /* What it returned is what we'll actually use. */
657 cmpl $(nr_syscalls), %eax 655 cmpl $(NR_syscalls), %eax
658 jnae syscall_call 656 jnae syscall_call
659 jmp syscall_exit 657 jmp syscall_exit
660END(syscall_trace_entry) 658END(syscall_trace_entry)
@@ -694,29 +692,28 @@ END(syscall_badsys)
694 * System calls that need a pt_regs pointer. 692 * System calls that need a pt_regs pointer.
695 */ 693 */
696#define PTREGSCALL0(name) \ 694#define PTREGSCALL0(name) \
697 ALIGN; \ 695ENTRY(ptregs_##name) ; \
698ptregs_##name: \
699 leal 4(%esp),%eax; \ 696 leal 4(%esp),%eax; \
700 jmp sys_##name; 697 jmp sys_##name; \
698ENDPROC(ptregs_##name)
701 699
702#define PTREGSCALL1(name) \ 700#define PTREGSCALL1(name) \
703 ALIGN; \ 701ENTRY(ptregs_##name) ; \
704ptregs_##name: \
705 leal 4(%esp),%edx; \ 702 leal 4(%esp),%edx; \
706 movl (PT_EBX+4)(%esp),%eax; \ 703 movl (PT_EBX+4)(%esp),%eax; \
707 jmp sys_##name; 704 jmp sys_##name; \
705ENDPROC(ptregs_##name)
708 706
709#define PTREGSCALL2(name) \ 707#define PTREGSCALL2(name) \
710 ALIGN; \ 708ENTRY(ptregs_##name) ; \
711ptregs_##name: \
712 leal 4(%esp),%ecx; \ 709 leal 4(%esp),%ecx; \
713 movl (PT_ECX+4)(%esp),%edx; \ 710 movl (PT_ECX+4)(%esp),%edx; \
714 movl (PT_EBX+4)(%esp),%eax; \ 711 movl (PT_EBX+4)(%esp),%eax; \
715 jmp sys_##name; 712 jmp sys_##name; \
713ENDPROC(ptregs_##name)
716 714
717#define PTREGSCALL3(name) \ 715#define PTREGSCALL3(name) \
718 ALIGN; \ 716ENTRY(ptregs_##name) ; \
719ptregs_##name: \
720 CFI_STARTPROC; \ 717 CFI_STARTPROC; \
721 leal 4(%esp),%eax; \ 718 leal 4(%esp),%eax; \
722 pushl_cfi %eax; \ 719 pushl_cfi %eax; \
@@ -741,8 +738,7 @@ PTREGSCALL2(vm86)
741PTREGSCALL1(vm86old) 738PTREGSCALL1(vm86old)
742 739
743/* Clone is an oddball. The 4th arg is in %edi */ 740/* Clone is an oddball. The 4th arg is in %edi */
744 ALIGN; 741ENTRY(ptregs_clone)
745ptregs_clone:
746 CFI_STARTPROC 742 CFI_STARTPROC
747 leal 4(%esp),%eax 743 leal 4(%esp),%eax
748 pushl_cfi %eax 744 pushl_cfi %eax
@@ -1213,11 +1209,6 @@ return_to_handler:
1213 jmp *%ecx 1209 jmp *%ecx
1214#endif 1210#endif
1215 1211
1216.section .rodata,"a"
1217#include "syscall_table_32.S"
1218
1219syscall_table_size=(.-sys_call_table)
1220
1221/* 1212/*
1222 * Some functions should be protected against kprobes 1213 * Some functions should be protected against kprobes
1223 */ 1214 */
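The audit fast-path change above (cmpl $-MAX_ERRNO / setbe in place of cmpl $0 / setl) classifies a syscall return as a failure only when it falls in the top MAX_ERRNO values of the address space, the same convention as the kernel's IS_ERR_VALUE(), instead of treating every negative-looking value (such as a high mmap address) as an error. A C sketch of the check (MAX_ERRNO is 4095 in the kernel):

#include <stdio.h>

#define MAX_ERRNO 4095UL

/* Mirrors IS_ERR_VALUE(): a return value is an errno only if,
 * viewed as unsigned, it is >= (unsigned long)-MAX_ERRNO,
 * i.e. one of the topmost 4095 values. */
static int is_error(long ret)
{
	return (unsigned long)ret >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	printf("%d\n", is_error(-38));	/* 1: -ENOSYS */
	printf("%d\n", is_error(-5000));	/* 0: valid value */
	printf("%d\n", is_error(0));	/* 0: success */
	return 0;
}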
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc87..3fe8239fd8fb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
55#include <asm/paravirt.h> 55#include <asm/paravirt.h>
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <linux/err.h>
58 59
59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 60/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
60#include <linux/elf-em.h> 61#include <linux/elf-em.h>
@@ -548,7 +549,7 @@ badsys:
548#ifdef CONFIG_AUDITSYSCALL 549#ifdef CONFIG_AUDITSYSCALL
549 /* 550 /*
550 * Fast path for syscall audit without full syscall trace. 551 * Fast path for syscall audit without full syscall trace.
551 * We just call audit_syscall_entry() directly, and then 552 * We just call __audit_syscall_entry() directly, and then
552 * jump back to the normal fast path. 553 * jump back to the normal fast path.
553 */ 554 */
554auditsys: 555auditsys:
@@ -558,22 +559,21 @@ auditsys:
558 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ 559 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
559 movq %rax,%rsi /* 2nd arg: syscall number */ 560 movq %rax,%rsi /* 2nd arg: syscall number */
560 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ 561 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
561 call audit_syscall_entry 562 call __audit_syscall_entry
562 LOAD_ARGS 0 /* reload call-clobbered registers */ 563 LOAD_ARGS 0 /* reload call-clobbered registers */
563 jmp system_call_fastpath 564 jmp system_call_fastpath
564 565
565 /* 566 /*
566 * Return fast path for syscall audit. Call audit_syscall_exit() 567 * Return fast path for syscall audit. Call __audit_syscall_exit()
567 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT 568 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
568 * masked off. 569 * masked off.
569 */ 570 */
570sysret_audit: 571sysret_audit:
571 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ 572 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
572 cmpq $0,%rsi /* is it < 0? */ 573 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
573 setl %al /* 1 if so, 0 if not */ 574 setbe %al /* 1 if so, 0 if not */
574 movzbl %al,%edi /* zero-extend that into %edi */ 575 movzbl %al,%edi /* zero-extend that into %edi */
575 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 576 call __audit_syscall_exit
576 call audit_syscall_exit
577 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 577 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
578 jmp sysret_check 578 jmp sysret_check
579#endif /* CONFIG_AUDITSYSCALL */ 579#endif /* CONFIG_AUDITSYSCALL */
@@ -1480,62 +1480,214 @@ ENTRY(error_exit)
1480 CFI_ENDPROC 1480 CFI_ENDPROC
1481END(error_exit) 1481END(error_exit)
1482 1482
1483/*
1484 * Test if a given stack is an NMI stack or not.
1485 */
1486 .macro test_in_nmi reg stack nmi_ret normal_ret
1487 cmpq %\reg, \stack
1488 ja \normal_ret
1489 subq $EXCEPTION_STKSZ, %\reg
1490 cmpq %\reg, \stack
1491 jb \normal_ret
1492 jmp \nmi_ret
1493 .endm
1483 1494
1484 /* runs on exception stack */ 1495 /* runs on exception stack */
1485ENTRY(nmi) 1496ENTRY(nmi)
1486 INTR_FRAME 1497 INTR_FRAME
1487 PARAVIRT_ADJUST_EXCEPTION_FRAME 1498 PARAVIRT_ADJUST_EXCEPTION_FRAME
1488 pushq_cfi $-1 1499 /*
1500 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1501 * the iretq it performs will take us out of NMI context.
1502 * This means that we can have nested NMIs where the next
1503 * NMI is using the top of the stack of the previous NMI. We
1504 * can't let it execute because the nested NMI will corrupt the
1505 * stack of the previous NMI. NMI handlers are not re-entrant
1506 * anyway.
1507 *
1508 * To handle this case we do the following:
1509 * Check a special location on the stack that contains
1510 * a variable that is set when NMIs are executing.
1511 * The interrupted task's stack is also checked to see if it
1512 * is an NMI stack.
1513 * If the variable is not set and the stack is not the NMI
1514 * stack then:
1515 * o Set the special variable on the stack
1516 * o Copy the interrupt frame into a "saved" location on the stack
1517 * o Copy the interrupt frame into a "copy" location on the stack
1518 * o Continue processing the NMI
1519 * If the variable is set or the previous stack is the NMI stack:
1520 * o Modify the "copy" location to jump to repeat_nmi
1521 * o Return to the first NMI
1522 *
1523 * Now on exit of the first NMI, we first clear the stack variable.
1524 * The NMI stack will tell any nested NMIs at that point that it is
1525 * nested. Then we pop the stack normally with iret, and if there was
1526 * a nested NMI that updated the copy interrupt stack frame, a
1527 * jump will be made to the repeat_nmi code that will handle the second
1528 * NMI.
1529 */
1530
1531 /* Use %rdx as our temp variable throughout */
1532 pushq_cfi %rdx
1533
1534 /*
1535 * Check the special variable on the stack to see if NMIs are
1536 * executing.
1537 */
1538 cmp $1, -8(%rsp)
1539 je nested_nmi
1540
1541 /*
1542 * Now test if the previous stack was an NMI stack.
1543 * We need the double check. We check the NMI stack to satisfy the
1544 * race when the first NMI clears the variable before returning.
1545 * We check the variable because the first NMI could be in a
1546 * breakpoint routine using a breakpoint stack.
1547 */
1548 lea 6*8(%rsp), %rdx
1549 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1550
1551nested_nmi:
1552 /*
1553 * Do nothing if we interrupted the fixup in repeat_nmi.
1554 * It's about to repeat the NMI handler, so we are fine
1555 * with ignoring this one.
1556 */
1557 movq $repeat_nmi, %rdx
1558 cmpq 8(%rsp), %rdx
1559 ja 1f
1560 movq $end_repeat_nmi, %rdx
1561 cmpq 8(%rsp), %rdx
1562 ja nested_nmi_out
1563
15641:
1565 /* Set up the interrupted NMI's stack to jump to repeat_nmi */
1566 leaq -6*8(%rsp), %rdx
1567 movq %rdx, %rsp
1568 CFI_ADJUST_CFA_OFFSET 6*8
1569 pushq_cfi $__KERNEL_DS
1570 pushq_cfi %rdx
1571 pushfq_cfi
1572 pushq_cfi $__KERNEL_CS
1573 pushq_cfi $repeat_nmi
1574
1575 /* Put stack back */
1576 addq $(11*8), %rsp
1577 CFI_ADJUST_CFA_OFFSET -11*8
1578
1579nested_nmi_out:
1580 popq_cfi %rdx
1581
1582 /* No need to check faults here */
1583 INTERRUPT_RETURN
1584
1585first_nmi:
1586 /*
1587 * Because nested NMIs will use the pushed location that we
1588 * stored in rdx, we must keep that space available.
1589 * Here's what our stack frame will look like:
1590 * +-------------------------+
1591 * | original SS |
1592 * | original Return RSP |
1593 * | original RFLAGS |
1594 * | original CS |
1595 * | original RIP |
1596 * +-------------------------+
1597 * | temp storage for rdx |
1598 * +-------------------------+
1599 * | NMI executing variable |
1600 * +-------------------------+
1601 * | Saved SS |
1602 * | Saved Return RSP |
1603 * | Saved RFLAGS |
1604 * | Saved CS |
1605 * | Saved RIP |
1606 * +-------------------------+
1607 * | copied SS |
1608 * | copied Return RSP |
1609 * | copied RFLAGS |
1610 * | copied CS |
1611 * | copied RIP |
1612 * +-------------------------+
1613 * | pt_regs |
1614 * +-------------------------+
1615 *
1616 * The saved RIP is used to fix up the copied RIP that a nested
1617 * NMI may zero out. The original stack frame and the temp storage
1618 * are also used by nested NMIs and cannot be trusted on exit.
1619 */
1620 /* Set the NMI executing variable on the stack. */
1621 pushq_cfi $1
1622
1623 /* Copy the stack frame to the Saved frame */
1624 .rept 5
1625 pushq_cfi 6*8(%rsp)
1626 .endr
1627
1628 /* Make another copy, this one may be modified by nested NMIs */
1629 .rept 5
1630 pushq_cfi 4*8(%rsp)
1631 .endr
1632
1633 /* Do not pop rdx, nested NMIs will corrupt it */
1634 movq 11*8(%rsp), %rdx
1635
1636 /*
1637 * Everything below this point can be preempted by a nested
1638 * NMI if the first NMI took an exception. Repeated NMIs
1639 * caused by an exception and a nested NMI will start here, and
1640 * can still be preempted by another NMI.
1641 */
1642restart_nmi:
1643 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1489 subq $ORIG_RAX-R15, %rsp 1644 subq $ORIG_RAX-R15, %rsp
1490 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1645 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1646 /*
1647 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1648 * as we should not be calling schedule in NMI context,
1649 * even with normal interrupts enabled. An NMI should not be
1650 * setting NEED_RESCHED or anything that normal interrupts and
1651 * exceptions might do.
1652 */
1491 call save_paranoid 1653 call save_paranoid
1492 DEFAULT_FRAME 0 1654 DEFAULT_FRAME 0
1493 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1655 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1494 movq %rsp,%rdi 1656 movq %rsp,%rdi
1495 movq $-1,%rsi 1657 movq $-1,%rsi
1496 call do_nmi 1658 call do_nmi
1497#ifdef CONFIG_TRACE_IRQFLAGS
1498 /* paranoidexit; without TRACE_IRQS_OFF */
1499 /* ebx: no swapgs flag */
1500 DISABLE_INTERRUPTS(CLBR_NONE)
1501 testl %ebx,%ebx /* swapgs needed? */ 1659 testl %ebx,%ebx /* swapgs needed? */
1502 jnz nmi_restore 1660 jnz nmi_restore
1503 testl $3,CS(%rsp)
1504 jnz nmi_userspace
1505nmi_swapgs: 1661nmi_swapgs:
1506 SWAPGS_UNSAFE_STACK 1662 SWAPGS_UNSAFE_STACK
1507nmi_restore: 1663nmi_restore:
1508 RESTORE_ALL 8 1664 RESTORE_ALL 8
1665 /* Clear the NMI executing stack variable */
1666 movq $0, 10*8(%rsp)
1509 jmp irq_return 1667 jmp irq_return
1510nmi_userspace:
1511 GET_THREAD_INFO(%rcx)
1512 movl TI_flags(%rcx),%ebx
1513 andl $_TIF_WORK_MASK,%ebx
1514 jz nmi_swapgs
1515 movq %rsp,%rdi /* &pt_regs */
1516 call sync_regs
1517 movq %rax,%rsp /* switch stack for scheduling */
1518 testl $_TIF_NEED_RESCHED,%ebx
1519 jnz nmi_schedule
1520 movl %ebx,%edx /* arg3: thread flags */
1521 ENABLE_INTERRUPTS(CLBR_NONE)
1522 xorl %esi,%esi /* arg2: oldset */
1523 movq %rsp,%rdi /* arg1: &pt_regs */
1524 call do_notify_resume
1525 DISABLE_INTERRUPTS(CLBR_NONE)
1526 jmp nmi_userspace
1527nmi_schedule:
1528 ENABLE_INTERRUPTS(CLBR_ANY)
1529 call schedule
1530 DISABLE_INTERRUPTS(CLBR_ANY)
1531 jmp nmi_userspace
1532 CFI_ENDPROC
1533#else
1534 jmp paranoid_exit
1535 CFI_ENDPROC 1668 CFI_ENDPROC
1536#endif
1537END(nmi) 1669END(nmi)
1538 1670
1671 /*
1672 * If an NMI handler triggers an exception or breakpoint, the
1673 * resulting iret can lose the NMI context, and a nested NMI may come in.
1674 * In that case, the nested NMI will change the preempted NMI's
1675 * stack to jump to here when it does the final iret.
1676 */
1677repeat_nmi:
1678 INTR_FRAME
1679 /* Update the stack variable to say we are still in NMI */
1680 movq $1, 5*8(%rsp)
1681
1682	/* Copy the saved stack back to the copy stack */
1683 .rept 5
1684 pushq_cfi 4*8(%rsp)
1685 .endr
1686
1687 jmp restart_nmi
1688 CFI_ENDPROC
1689end_repeat_nmi:
1690
1539ENTRY(ignore_sysret) 1691ENTRY(ignore_sysret)
1540 CFI_STARTPROC 1692 CFI_STARTPROC
1541 mov $-ENOSYS,%eax 1693 mov $-ENOSYS,%eax
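[Editor's note] The nested-NMI entry decision above is easier to follow as plain control flow. Below is a minimal C sketch of the same three-way classification; all names (classify_nmi, repeat_lo/repeat_hi) are hypothetical stand-ins, since the real code works directly on words of the NMI stack frame:

    #include <stdbool.h>
    #include <stdint.h>

    enum nmi_entry {
            FIRST_NMI,          /* fall through to first_nmi */
            NESTED_IGNORE,      /* nested_nmi_out: plain iret */
            NESTED_REDIRECT,    /* rewrite frame to jump to repeat_nmi */
    };

    static enum nmi_entry classify_nmi(bool nmi_executing,
                                       bool prev_stack_was_nmi_stack,
                                       uint64_t rip,
                                       uint64_t repeat_lo, uint64_t repeat_hi)
    {
            /* Both tests are needed, for the two races the comments describe. */
            if (!nmi_executing && !prev_stack_was_nmi_stack)
                    return FIRST_NMI;       /* set up the saved/copied frames */

            /*
             * An NMI that interrupted the repeat_nmi fixup itself can be
             * dropped: the handler is about to be rerun anyway.
             */
            if (rip >= repeat_lo && rip < repeat_hi)
                    return NESTED_IGNORE;

            return NESTED_REDIRECT;         /* point the old NMI at repeat_nmi */
    }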
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
417ENTRY(idt_table) 417ENTRY(idt_table)
418 .skip IDT_ENTRIES * 16 418 .skip IDT_ENTRIES * 16
419 419
420 .align L1_CACHE_BYTES
421ENTRY(nmi_idt_table)
422 .skip IDT_ENTRIES * 16
423
420 __PAGE_ALIGNED_BSS 424 __PAGE_ALIGNED_BSS
421 .align PAGE_SIZE 425 .align PAGE_SIZE
422ENTRY(empty_zero_page) 426ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 07b0a56a754d..ad0de0c2714e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h> 3#include <linux/interrupt.h>
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/sysdev.h>
6#include <linux/delay.h> 5#include <linux/delay.h>
7#include <linux/errno.h> 6#include <linux/errno.h>
8#include <linux/i8253.h> 7#include <linux/i8253.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs); 28EXPORT_PER_CPU_SYMBOL(irq_regs);
29 29
30#ifdef CONFIG_DEBUG_STACKOVERFLOW 30#ifdef CONFIG_DEBUG_STACKOVERFLOW
31
32int sysctl_panic_on_stackoverflow __read_mostly;
33
31/* Debugging check for stack overflow: is there less than 1KB free? */ 34/* Debugging check for stack overflow: is there less than 1KB free? */
32static int check_stack_overflow(void) 35static int check_stack_overflow(void)
33{ 36{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
43{ 46{
44 printk(KERN_WARNING "low stack detected by irq handler\n"); 47 printk(KERN_WARNING "low stack detected by irq handler\n");
45 dump_stack(); 48 dump_stack();
49 if (sysctl_panic_on_stackoverflow)
50 panic("low stack detected by irq handler - check messages\n");
46} 51}
47 52
48#else 53#else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca468c47a..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
26DEFINE_PER_CPU(struct pt_regs *, irq_regs); 26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs); 27EXPORT_PER_CPU_SYMBOL(irq_regs);
28 28
29int sysctl_panic_on_stackoverflow;
30
29/* 31/*
30 * Probabilistic stack overflow check: 32 * Probabilistic stack overflow check:
31 * 33 *
@@ -36,18 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
36static inline void stack_overflow_check(struct pt_regs *regs) 38static inline void stack_overflow_check(struct pt_regs *regs)
37{ 39{
38#ifdef CONFIG_DEBUG_STACKOVERFLOW 40#ifdef CONFIG_DEBUG_STACKOVERFLOW
41#define STACK_TOP_MARGIN 128
42 struct orig_ist *oist;
43 u64 irq_stack_top, irq_stack_bottom;
44 u64 estack_top, estack_bottom;
39 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
40 46
41 if (user_mode_vm(regs)) 47 if (user_mode_vm(regs))
42 return; 48 return;
43 49
44 WARN_ONCE(regs->sp >= curbase && 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
45 regs->sp <= curbase + THREAD_SIZE && 51 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
46 regs->sp < curbase + sizeof(struct thread_info) + 52 regs->sp <= curbase + THREAD_SIZE)
47 sizeof(struct pt_regs) + 128, 53 return;
54
55 irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
56 STACK_TOP_MARGIN;
57 irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
58 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
59 return;
60
61 oist = &__get_cpu_var(orig_ist);
62 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
63 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
64 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
65 return;
66
67 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
68 current->comm, curbase, regs->sp,
69 irq_stack_top, irq_stack_bottom,
70 estack_top, estack_bottom);
48 71
49 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 72 if (sysctl_panic_on_stackoverflow)
50 current->comm, curbase, regs->sp); 73 panic("low stack detected by irq handler - check messages\n");
51#endif 74#endif
52} 75}
53 76
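[Editor's note] The rewritten stack_overflow_check() is just three inclusive range tests against stacks that grow down. A simplified user-space model of the idea (bounds are hypothetical placeholders for the per-cpu values; the real code applies the margin only where shown in the diff):

    #include <stdbool.h>
    #include <stdint.h>

    #define STACK_TOP_MARGIN 128

    /* top < bottom because stacks grow toward lower addresses */
    struct stack_range { uint64_t top, bottom; };

    static bool sp_in_range(uint64_t sp, struct stack_range r)
    {
            return sp >= r.top + STACK_TOP_MARGIN && sp <= r.bottom;
    }

    static bool stack_overflowed(uint64_t sp, struct stack_range task,
                                 struct stack_range irq, struct stack_range estack)
    {
            /* Fine if sp sits in the task, IRQ, or exception stack. */
            return !(sp_in_range(sp, task) ||
                     sp_in_range(sp, irq)  ||
                     sp_in_range(sp, estack));
    }

When none of the ranges match, the kernel WARNs and, if the new sysctl_panic_on_stackoverflow flag is set (presumably exposed as a sysctl elsewhere in the series), panics instead of limping on.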
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h> 12#include <linux/device.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h> 14#include <linux/acpi.h>
15#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41 41
42#define MMU_QUEUE_SIZE 1024
43
44static int kvmapf = 1; 42static int kvmapf = 1;
45 43
46static int parse_no_kvmapf(char *arg) 44static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
60 58
61early_param("no-steal-acc", parse_no_stealacc); 59early_param("no-steal-acc", parse_no_stealacc);
62 60
63struct kvm_para_state {
64 u8 mmu_queue[MMU_QUEUE_SIZE];
65 int mmu_queue_len;
66};
67
68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 61static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 62static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
71static int has_steal_clock = 0; 63static int has_steal_clock = 0;
72 64
73static struct kvm_para_state *kvm_para_state(void)
74{
75 return &per_cpu(para_state, raw_smp_processor_id());
76}
77
78/* 65/*
79 * No need for any "IO delay" on KVM 66 * No need for any "IO delay" on KVM
80 */ 67 */
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
271 } 258 }
272} 259}
273 260
274static void kvm_mmu_op(void *buffer, unsigned len)
275{
276 int r;
277 unsigned long a1, a2;
278
279 do {
280 a1 = __pa(buffer);
281 a2 = 0; /* on i386 __pa() always returns <4G */
282 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
283 buffer += r;
284 len -= r;
285 } while (len);
286}
287
288static void mmu_queue_flush(struct kvm_para_state *state)
289{
290 if (state->mmu_queue_len) {
291 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
292 state->mmu_queue_len = 0;
293 }
294}
295
296static void kvm_deferred_mmu_op(void *buffer, int len)
297{
298 struct kvm_para_state *state = kvm_para_state();
299
300 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
301 kvm_mmu_op(buffer, len);
302 return;
303 }
304 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
305 mmu_queue_flush(state);
306 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
307 state->mmu_queue_len += len;
308}
309
310static void kvm_mmu_write(void *dest, u64 val)
311{
312 __u64 pte_phys;
313 struct kvm_mmu_op_write_pte wpte;
314
315#ifdef CONFIG_HIGHPTE
316 struct page *page;
317 unsigned long dst = (unsigned long) dest;
318
319 page = kmap_atomic_to_page(dest);
320 pte_phys = page_to_pfn(page);
321 pte_phys <<= PAGE_SHIFT;
322 pte_phys += (dst & ~(PAGE_MASK));
323#else
324 pte_phys = (unsigned long)__pa(dest);
325#endif
326 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
327 wpte.pte_val = val;
328 wpte.pte_phys = pte_phys;
329
330 kvm_deferred_mmu_op(&wpte, sizeof wpte);
331}
332
333/*
334 * We only need to hook operations that are MMU writes. We hook these so that
335 * we can use lazy MMU mode to batch these operations. We could probably
336 * improve the performance of the host code if we used some of the information
337 * here to simplify processing of batched writes.
338 */
339static void kvm_set_pte(pte_t *ptep, pte_t pte)
340{
341 kvm_mmu_write(ptep, pte_val(pte));
342}
343
344static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
345 pte_t *ptep, pte_t pte)
346{
347 kvm_mmu_write(ptep, pte_val(pte));
348}
349
350static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
351{
352 kvm_mmu_write(pmdp, pmd_val(pmd));
353}
354
355#if PAGETABLE_LEVELS >= 3
356#ifdef CONFIG_X86_PAE
357static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
358{
359 kvm_mmu_write(ptep, pte_val(pte));
360}
361
362static void kvm_pte_clear(struct mm_struct *mm,
363 unsigned long addr, pte_t *ptep)
364{
365 kvm_mmu_write(ptep, 0);
366}
367
368static void kvm_pmd_clear(pmd_t *pmdp)
369{
370 kvm_mmu_write(pmdp, 0);
371}
372#endif
373
374static void kvm_set_pud(pud_t *pudp, pud_t pud)
375{
376 kvm_mmu_write(pudp, pud_val(pud));
377}
378
379#if PAGETABLE_LEVELS == 4
380static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
381{
382 kvm_mmu_write(pgdp, pgd_val(pgd));
383}
384#endif
385#endif /* PAGETABLE_LEVELS >= 3 */
386
387static void kvm_flush_tlb(void)
388{
389 struct kvm_mmu_op_flush_tlb ftlb = {
390 .header.op = KVM_MMU_OP_FLUSH_TLB,
391 };
392
393 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
394}
395
396static void kvm_release_pt(unsigned long pfn)
397{
398 struct kvm_mmu_op_release_pt rpt = {
399 .header.op = KVM_MMU_OP_RELEASE_PT,
400 .pt_phys = (u64)pfn << PAGE_SHIFT,
401 };
402
403 kvm_mmu_op(&rpt, sizeof rpt);
404}
405
406static void kvm_enter_lazy_mmu(void)
407{
408 paravirt_enter_lazy_mmu();
409}
410
411static void kvm_leave_lazy_mmu(void)
412{
413 struct kvm_para_state *state = kvm_para_state();
414
415 mmu_queue_flush(state);
416 paravirt_leave_lazy_mmu();
417}
418
419static void __init paravirt_ops_setup(void) 261static void __init paravirt_ops_setup(void)
420{ 262{
421 pv_info.name = "KVM"; 263 pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
424 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 266 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
425 pv_cpu_ops.io_delay = kvm_io_delay; 267 pv_cpu_ops.io_delay = kvm_io_delay;
426 268
427 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
428 pv_mmu_ops.set_pte = kvm_set_pte;
429 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
430 pv_mmu_ops.set_pmd = kvm_set_pmd;
431#if PAGETABLE_LEVELS >= 3
432#ifdef CONFIG_X86_PAE
433 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
434 pv_mmu_ops.pte_clear = kvm_pte_clear;
435 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
436#endif
437 pv_mmu_ops.set_pud = kvm_set_pud;
438#if PAGETABLE_LEVELS == 4
439 pv_mmu_ops.set_pgd = kvm_set_pgd;
440#endif
441#endif
442 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
443 pv_mmu_ops.release_pte = kvm_release_pt;
444 pv_mmu_ops.release_pmd = kvm_release_pt;
445 pv_mmu_ops.release_pud = kvm_release_pt;
446
447 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
448 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
449 }
450#ifdef CONFIG_X86_IO_APIC 269#ifdef CONFIG_X86_IO_APIC
451 no_timer_check = 1; 270 no_timer_check = 1;
452#endif 271#endif

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9302e2d0eb4b..fda91c307104 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
292 return err; 292 return err;
293} 293}
294 294
295static ssize_t reload_store(struct sys_device *dev, 295static ssize_t reload_store(struct device *dev,
296 struct sysdev_attribute *attr, 296 struct device_attribute *attr,
297 const char *buf, size_t size) 297 const char *buf, size_t size)
298{ 298{
299 unsigned long val; 299 unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
318 return ret; 318 return ret;
319} 319}
320 320
321static ssize_t version_show(struct sys_device *dev, 321static ssize_t version_show(struct device *dev,
322 struct sysdev_attribute *attr, char *buf) 322 struct device_attribute *attr, char *buf)
323{ 323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
325 325
326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); 326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
327} 327}
328 328
329static ssize_t pf_show(struct sys_device *dev, 329static ssize_t pf_show(struct device *dev,
330 struct sysdev_attribute *attr, char *buf) 330 struct device_attribute *attr, char *buf)
331{ 331{
332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
333 333
334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); 334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
335} 335}
336 336
337static SYSDEV_ATTR(reload, 0200, NULL, reload_store); 337static DEVICE_ATTR(reload, 0200, NULL, reload_store);
338static SYSDEV_ATTR(version, 0400, version_show, NULL); 338static DEVICE_ATTR(version, 0400, version_show, NULL);
339static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); 339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 340
341static struct attribute *mc_default_attrs[] = { 341static struct attribute *mc_default_attrs[] = {
342 &attr_reload.attr, 342 &dev_attr_reload.attr,
343 &attr_version.attr, 343 &dev_attr_version.attr,
344 &attr_processor_flags.attr, 344 &dev_attr_processor_flags.attr,
345 NULL 345 NULL
346}; 346};
347 347
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
405 return ustate; 405 return ustate;
406} 406}
407 407
408static int mc_sysdev_add(struct sys_device *sys_dev) 408static int mc_device_add(struct device *dev, struct subsys_interface *sif)
409{ 409{
410 int err, cpu = sys_dev->id; 410 int err, cpu = dev->id;
411 411
412 if (!cpu_online(cpu)) 412 if (!cpu_online(cpu))
413 return 0; 413 return 0;
414 414
415 pr_debug("CPU%d added\n", cpu); 415 pr_debug("CPU%d added\n", cpu);
416 416
417 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 417 err = sysfs_create_group(&dev->kobj, &mc_attr_group);
418 if (err) 418 if (err)
419 return err; 419 return err;
420 420
421 if (microcode_init_cpu(cpu) == UCODE_ERROR) { 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 422 sysfs_remove_group(&dev->kobj, &mc_attr_group);
423 return -EINVAL; 423 return -EINVAL;
424 } 424 }
425 425
426 return err; 426 return err;
427} 427}
428 428
429static int mc_sysdev_remove(struct sys_device *sys_dev) 429static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
430{ 430{
431 int cpu = sys_dev->id; 431 int cpu = dev->id;
432 432
433 if (!cpu_online(cpu)) 433 if (!cpu_online(cpu))
434 return 0; 434 return 0;
435 435
436 pr_debug("CPU%d removed\n", cpu); 436 pr_debug("CPU%d removed\n", cpu);
437 microcode_fini_cpu(cpu); 437 microcode_fini_cpu(cpu);
438 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 438 sysfs_remove_group(&dev->kobj, &mc_attr_group);
439 return 0; 439 return 0;
440} 440}
441 441
442static struct sysdev_driver mc_sysdev_driver = { 442static struct subsys_interface mc_cpu_interface = {
443 .add = mc_sysdev_add, 443 .name = "microcode",
444 .remove = mc_sysdev_remove, 444 .subsys = &cpu_subsys,
445 .add_dev = mc_device_add,
446 .remove_dev = mc_device_remove,
445}; 447};
446 448
447/** 449/**
@@ -464,9 +466,9 @@ static __cpuinit int
464mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 466mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
465{ 467{
466 unsigned int cpu = (unsigned long)hcpu; 468 unsigned int cpu = (unsigned long)hcpu;
467 struct sys_device *sys_dev; 469 struct device *dev;
468 470
469 sys_dev = get_cpu_sysdev(cpu); 471 dev = get_cpu_device(cpu);
470 switch (action) { 472 switch (action) {
471 case CPU_ONLINE: 473 case CPU_ONLINE:
472 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
474 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
475 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
476 pr_debug("CPU%d added\n", cpu); 478 pr_debug("CPU%d added\n", cpu);
477 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&dev->kobj, &mc_attr_group))
478 pr_err("Failed to create group for CPU%d\n", cpu); 480 pr_err("Failed to create group for CPU%d\n", cpu);
479 break; 481 break;
480 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
481 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
482 /* Suspend is in progress, only remove the interface */ 484 /* Suspend is in progress, only remove the interface */
483 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 485 sysfs_remove_group(&dev->kobj, &mc_attr_group);
484 pr_debug("CPU%d removed\n", cpu); 486 pr_debug("CPU%d removed\n", cpu);
485 break; 487 break;
486 488
@@ -525,7 +527,7 @@ static int __init microcode_init(void)
525 get_online_cpus(); 527 get_online_cpus();
526 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
527 529
528 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 530 error = subsys_interface_register(&mc_cpu_interface);
529 531
530 mutex_unlock(&microcode_mutex); 532 mutex_unlock(&microcode_mutex);
531 put_online_cpus(); 533 put_online_cpus();
@@ -535,7 +537,7 @@ static int __init microcode_init(void)
535 537
536 error = microcode_dev_init(); 538 error = microcode_dev_init();
537 if (error) 539 if (error)
538 goto out_sysdev_driver; 540 goto out_driver;
539 541
540 register_syscore_ops(&mc_syscore_ops); 542 register_syscore_ops(&mc_syscore_ops);
541 register_hotcpu_notifier(&mc_cpu_notifier); 543 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -545,11 +547,11 @@ static int __init microcode_init(void)
545 547
546 return 0; 548 return 0;
547 549
548out_sysdev_driver: 550out_driver:
549 get_online_cpus(); 551 get_online_cpus();
550 mutex_lock(&microcode_mutex); 552 mutex_lock(&microcode_mutex);
551 553
552 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 554 subsys_interface_unregister(&mc_cpu_interface);
553 555
554 mutex_unlock(&microcode_mutex); 556 mutex_unlock(&microcode_mutex);
555 put_online_cpus(); 557 put_online_cpus();
@@ -573,7 +575,7 @@ static void __exit microcode_exit(void)
573 get_online_cpus(); 575 get_online_cpus();
574 mutex_lock(&microcode_mutex); 576 mutex_lock(&microcode_mutex);
575 577
576 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 578 subsys_interface_unregister(&mc_cpu_interface);
577 579
578 mutex_unlock(&microcode_mutex); 580 mutex_unlock(&microcode_mutex);
579 put_online_cpus(); 581 put_online_cpus();
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
236 .notifier_call = msr_class_cpu_callback, 236 .notifier_call = msr_class_cpu_callback,
237}; 237};
238 238
239static char *msr_devnode(struct device *dev, mode_t *mode) 239static char *msr_devnode(struct device *dev, umode_t *mode)
240{ 240{
241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
242} 242}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58ddd..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
405 unknown_nmi_error(reason, regs); 405 unknown_nmi_error(reason, regs);
406} 406}
407 407
408/*
409 * An NMI can hit a breakpoint, which causes it to lose its
410 * NMI context with the CPU when the breakpoint handler does an iret.
411 */
412#ifdef CONFIG_X86_32
413/*
414 * For i386, NMIs use the same stack as the kernel, and we can
415 * add a workaround to the iret problem in C. Simply have 3 states
416 * the NMI can be in.
417 *
418 * 1) not running
419 * 2) executing
420 * 3) latched
421 *
422 * When no NMI is in progress, it is in the "not running" state.
423 * When an NMI comes in, it goes into the "executing" state.
424 * Normally, if another NMI is triggered, it does not interrupt
425 * the running NMI and the HW will simply latch it so that when
426 * the first NMI finishes, it will restart the second NMI.
427 * (Note, the latch is binary, thus multiple NMIs triggering,
428 * when one is running, are ignored. Only one NMI is restarted.)
429 *
430 * If an NMI hits a breakpoint that executes an iret, another
431 * NMI can preempt it. We do not want to allow this new NMI
432 * to run, but we want to execute it when the first one finishes.
433 * We set the state to "latched", and the first NMI will perform
434 * a cmpxchg on the state, and if it doesn't successfully
435 * reset the state to "not running" it will restart the next
436 * NMI.
437 */
438enum nmi_states {
439 NMI_NOT_RUNNING,
440 NMI_EXECUTING,
441 NMI_LATCHED,
442};
443static DEFINE_PER_CPU(enum nmi_states, nmi_state);
444
445#define nmi_nesting_preprocess(regs) \
446 do { \
447 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
448 __get_cpu_var(nmi_state) = NMI_LATCHED; \
449 return; \
450 } \
451 nmi_restart: \
452 __get_cpu_var(nmi_state) = NMI_EXECUTING; \
453 } while (0)
454
455#define nmi_nesting_postprocess() \
456 do { \
457 if (cmpxchg(&__get_cpu_var(nmi_state), \
458 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
459 goto nmi_restart; \
460 } while (0)
461#else /* x86_64 */
462/*
463 * In x86_64 things are a bit more difficult. This has the same problem
464 * where an NMI hitting a breakpoint that calls iret will remove the
465 * NMI context, allowing a nested NMI to enter. What makes this more
466 * difficult is that both NMIs and breakpoints have their own stack.
467 * When a new NMI or breakpoint is executed, the stack is set to a fixed
468 * point. If an NMI is nested, it will have its stack set at that same
469 * fixed address that the first NMI had, and will start corrupting the
470 * stack. This is handled in entry_64.S, but the same problem exists with
471 * the breakpoint stack.
472 *
473 * If a breakpoint is being processed and the debug stack is in use,
474 * and an NMI comes in and also hits a breakpoint, the stack pointer
475 * will be set to the same fixed address as the breakpoint that was
476 * interrupted, causing that stack to be corrupted. To handle this case,
477 * check if the stack that was interrupted is the debug stack, and if
478 * so, change the IDT so that new breakpoints will use the current stack
479 * and not switch to the fixed address. On return of the NMI, switch back
480 * to the original IDT.
481 */
482static DEFINE_PER_CPU(int, update_debug_stack);
483
484static inline void nmi_nesting_preprocess(struct pt_regs *regs)
485{
486 /*
487 * If we interrupted a breakpoint, it is possible that
488 * the nmi handler will have breakpoints too. We need to
489 * change the IDT such that breakpoints that happen here
490 * continue to use the NMI stack.
491 */
492 if (unlikely(is_debug_stack(regs->sp))) {
493 debug_stack_set_zero();
494 __get_cpu_var(update_debug_stack) = 1;
495 }
496}
497
498static inline void nmi_nesting_postprocess(void)
499{
500 if (unlikely(__get_cpu_var(update_debug_stack)))
501 debug_stack_reset();
502}
503#endif
504
408dotraplinkage notrace __kprobes void 505dotraplinkage notrace __kprobes void
409do_nmi(struct pt_regs *regs, long error_code) 506do_nmi(struct pt_regs *regs, long error_code)
410{ 507{
508 nmi_nesting_preprocess(regs);
509
411 nmi_enter(); 510 nmi_enter();
412 511
413 inc_irq_stat(__nmi_count); 512 inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
416 default_do_nmi(regs); 515 default_do_nmi(regs);
417 516
418 nmi_exit(); 517 nmi_exit();
518
519 /* On i386, may loop back to preprocess */
520 nmi_nesting_postprocess();
419} 521}
420 522
421void stop_nmi(void) 523void stop_nmi(void)
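[Editor's note] The i386 three-state latch above can be modeled in ordinary C with an atomic compare-and-swap. A stand-alone sketch, using C11 atomics in place of the kernel's per-cpu state (do_nmi_body() is a placeholder for the real handler):

    #include <stdatomic.h>

    enum nmi_state { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };

    static _Atomic enum nmi_state nmi_state = NMI_NOT_RUNNING;

    static void do_nmi_body(void) { /* real NMI work goes here */ }

    static void do_nmi_model(void)
    {
            if (atomic_load(&nmi_state) != NMI_NOT_RUNNING) {
                    /* Nested NMI: latch it; the first NMI replays us. */
                    atomic_store(&nmi_state, NMI_LATCHED);
                    return;
            }
            do {
                    atomic_store(&nmi_state, NMI_EXECUTING);
                    do_nmi_body();
                    /*
                     * If a nested NMI moved the state to NMI_LATCHED in the
                     * meantime, the cmpxchg fails and we loop to run the
                     * handler once more.
                     */
            } while (!atomic_compare_exchange_strong(
                            &nmi_state,
                            &(enum nmi_state){ NMI_EXECUTING },
                            NMI_NOT_RUNNING));
    }

Note the latch is binary, exactly as the comment says: any number of NMIs arriving while one runs collapse into a single replay.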
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..0d01a8ea4e11
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
1/*
2 * arch/x86/kernel/nmi_selftest.c
3 *
4 * Testsuite for NMI: IPIs
5 *
6 * Started by Don Zickus:
7 * (using lib/locking-selftest.c as a guide)
8 *
9 * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
10 */
11
12#include <linux/smp.h>
13#include <linux/cpumask.h>
14#include <linux/delay.h>
15
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#define SUCCESS 0
20#define FAILURE 1
21#define TIMEOUT 2
22
23static int nmi_fail;
24
25/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
27
28static int testcase_total;
29static int testcase_successes;
30static int expected_testcase_failures;
31static int unexpected_testcase_failures;
32static int unexpected_testcase_unknowns;
33
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{
36 unexpected_testcase_unknowns++;
37 return NMI_HANDLED;
38}
39
40static void init_nmi_testsuite(void)
41{
42 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44}
45
46static void cleanup_nmi_testsuite(void)
47{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49}
50
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{
53 int cpu = raw_smp_processor_id();
54
55 if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
56 return NMI_HANDLED;
57
58 return NMI_DONE;
59}
60
61static void test_nmi_ipi(struct cpumask *mask)
62{
63 unsigned long timeout;
64
65 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
66 NMI_FLAG_FIRST, "nmi_selftest")) {
67 nmi_fail = FAILURE;
68 return;
69 }
70
71 /* sync above data before sending NMI */
72 wmb();
73
74 apic->send_IPI_mask(mask, NMI_VECTOR);
75
76 /* Don't wait longer than a second */
77 timeout = USEC_PER_SEC;
78 while (!cpumask_empty(mask) && timeout--)
79 udelay(1);
80
81 /* What happens if we timeout, do we still unregister?? */
82 unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
83
84 if (!timeout)
85 nmi_fail = TIMEOUT;
86 return;
87}
88
89static void remote_ipi(void)
90{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
93 if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95}
96
97static void local_ipi(void)
98{
99 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102}
103
104static void reset_nmi(void)
105{
106 nmi_fail = 0;
107}
108
109static void dotest(void (*testcase_fn)(void), int expected)
110{
111 testcase_fn();
112 /*
113 * Filter out expected failures:
114 */
115 if (nmi_fail != expected) {
116 unexpected_testcase_failures++;
117
118 if (nmi_fail == FAILURE)
119 printk("FAILED |");
120 else if (nmi_fail == TIMEOUT)
121 printk("TIMEOUT|");
122 else
123 printk("ERROR |");
124 dump_stack();
125 } else {
126 testcase_successes++;
127 printk(" ok |");
128 }
129 testcase_total++;
130
131 reset_nmi();
132}
133
134static inline void print_testname(const char *testname)
135{
136 printk("%12s:", testname);
137}
138
139void nmi_selftest(void)
140{
141 init_nmi_testsuite();
142
143 /*
144 * Run the testsuite:
145 */
146 printk("----------------\n");
147 printk("| NMI testsuite:\n");
148 printk("--------------------\n");
149
150 print_testname("remote IPI");
151 dotest(remote_ipi, SUCCESS);
152 printk("\n");
153 print_testname("local IPI");
154 dotest(local_ipi, SUCCESS);
155 printk("\n");
156
157 cleanup_nmi_testsuite();
158
159 if (unexpected_testcase_failures) {
160 printk("--------------------\n");
161 printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
162 unexpected_testcase_failures, testcase_total);
163 printk("-----------------------------------------------------------------\n");
164 } else if (expected_testcase_failures && testcase_successes) {
165 printk("--------------------\n");
166 printk("%3d out of %3d testcases failed, as expected. |\n",
167 expected_testcase_failures, testcase_total);
168 printk("----------------------------------------------------\n");
169 } else if (expected_testcase_failures && !testcase_successes) {
170 printk("--------------------\n");
171 printk("All %3d testcases failed, as expected. |\n",
172 expected_testcase_failures);
173 printk("----------------------------------------\n");
174 } else {
175 printk("--------------------\n");
176 printk("Good, all %3d testcases passed! |\n",
177 testcase_successes);
178 printk("---------------------------------\n");
179 }
180}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
49 58
50/* Dummy device used for NULL arguments (normally ISA). */ 59/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
169#endif 178#endif
170 if (!strncmp(p, "pt", 2)) 179 if (!strncmp(p, "pt", 2))
171 iommu_pass_through = 1; 180 iommu_pass_through = 1;
181 if (!strncmp(p, "group_mf", 8))
182 iommu_group_mf = 1;
172 183
173 gart_parse_options(p); 184 gart_parse_options(p);
174 185
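[Editor's note] Since iommu_setup() parses the option string word by word, the new flag is enabled from the kernel command line. Assuming the usual iommu= early parameter that this function serves, booting with:

    iommu=group_mf

sets iommu_group_mf, and an IOMMU driver honoring it then reports a single device-group per multi-function PCI device, which keeps userspace drivers from claiming individual functions.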
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7b5bb6..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
1392 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1392 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1393 trace_sys_enter(regs, regs->orig_ax); 1393 trace_sys_enter(regs, regs->orig_ax);
1394 1394
1395 if (unlikely(current->audit_context)) { 1395 if (IS_IA32)
1396 if (IS_IA32) 1396 audit_syscall_entry(AUDIT_ARCH_I386,
1397 audit_syscall_entry(AUDIT_ARCH_I386, 1397 regs->orig_ax,
1398 regs->orig_ax, 1398 regs->bx, regs->cx,
1399 regs->bx, regs->cx, 1399 regs->dx, regs->si);
1400 regs->dx, regs->si);
1401#ifdef CONFIG_X86_64 1400#ifdef CONFIG_X86_64
1402 else 1401 else
1403 audit_syscall_entry(AUDIT_ARCH_X86_64, 1402 audit_syscall_entry(AUDIT_ARCH_X86_64,
1404 regs->orig_ax, 1403 regs->orig_ax,
1405 regs->di, regs->si, 1404 regs->di, regs->si,
1406 regs->dx, regs->r10); 1405 regs->dx, regs->r10);
1407#endif 1406#endif
1408 }
1409 1407
1410 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1411} 1409}
@@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
1414{ 1412{
1415 bool step; 1413 bool step;
1416 1414
1417 if (unlikely(current->audit_context)) 1415 audit_syscall_exit(regs);
1418 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1419 1416
1420 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1417 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1421 trace_sys_exit(regs, regs->ax); 1418 trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d05444ac2aea..d7d5099fe874 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,12 +749,7 @@ void __init setup_arch(char **cmdline_p)
749#endif 749#endif
750#ifdef CONFIG_EFI 750#ifdef CONFIG_EFI
751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
752#ifdef CONFIG_X86_32 752 EFI_LOADER_SIGNATURE, 4)) {
753 "EL32",
754#else
755 "EL64",
756#endif
757 4)) {
758 efi_enabled = 1; 753 efi_enabled = 1;
759 efi_memblock_x86_reserve_range(); 754 efi_memblock_x86_reserve_range();
760 } 755 }
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 struct pt_regs *regs) 683 struct pt_regs *regs)
684{ 684{
685 sigset_t blocked;
686 int ret; 685 int ret;
687 686
688 /* Are we from a system call? */ 687 /* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
733 */ 732 */
734 regs->flags &= ~X86_EFLAGS_TF; 733 regs->flags &= ~X86_EFLAGS_TF;
735 734
736 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 735 block_sigmask(ka, sig);
737 if (!(ka->sa.sa_flags & SA_NODEFER))
738 sigaddset(&blocked, sig);
739 set_current_blocked(&blocked);
740 736
741 tracehook_signal_handler(sig, info, ka, regs, 737 tracehook_signal_handler(sig, info, ka, regs,
742 test_thread_flag(TIF_SINGLESTEP)); 738 test_thread_flag(TIF_SINGLESTEP));
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
29#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/nmi.h>
32/* 33/*
33 * Some notes on x86 processor bugs affecting SMP operation: 34 * Some notes on x86 processor bugs affecting SMP operation:
34 * 35 *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
148 free_cpumask_var(allbutself); 149 free_cpumask_var(allbutself);
149} 150}
150 151
152static atomic_t stopping_cpu = ATOMIC_INIT(-1);
153
154static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
155{
156 /* We are registered on stopping cpu too, avoid spurious NMI */
157 if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
158 return NMI_HANDLED;
159
160 stop_this_cpu(NULL);
161
162 return NMI_HANDLED;
163}
164
165static void native_nmi_stop_other_cpus(int wait)
166{
167 unsigned long flags;
168 unsigned long timeout;
169
170 if (reboot_force)
171 return;
172
173 /*
174 * Use our own vector here because smp_call_function
175 * does lots of things not suitable in a panic situation.
176 */
177 if (num_online_cpus() > 1) {
178 /* did someone beat us here? */
179 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
180 return;
181
182 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
183 NMI_FLAG_FIRST, "smp_stop"))
184 /* Note: we ignore failures here */
185 return;
186
187 /* sync above data before sending NMI */
188 wmb();
189
190 apic->send_IPI_allbutself(NMI_VECTOR);
191
192 /*
193 * Don't wait longer than a second if the caller
194 * didn't ask us to wait.
195 */
196 timeout = USEC_PER_SEC;
197 while (num_online_cpus() > 1 && (wait || timeout--))
198 udelay(1);
199 }
200
201 local_irq_save(flags);
202 disable_local_APIC();
203 local_irq_restore(flags);
204}
205
151/* 206/*
152 * this function calls the 'stop' function on all other CPUs in the system. 207 * this function calls the 'stop' function on all other CPUs in the system.
153 */ 208 */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
160 irq_exit(); 215 irq_exit();
161} 216}
162 217
163static void native_stop_other_cpus(int wait) 218static void native_irq_stop_other_cpus(int wait)
164{ 219{
165 unsigned long flags; 220 unsigned long flags;
166 unsigned long timeout; 221 unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
194 local_irq_restore(flags); 249 local_irq_restore(flags);
195} 250}
196 251
252static void native_smp_disable_nmi_ipi(void)
253{
254 smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
255}
256
197/* 257/*
198 * Reschedule call back. 258 * Reschedule call back.
199 */ 259 */
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
225 irq_exit(); 285 irq_exit();
226} 286}
227 287
288static int __init nonmi_ipi_setup(char *str)
289{
290 native_smp_disable_nmi_ipi();
291 return 1;
292}
293
294__setup("nonmi_ipi", nonmi_ipi_setup);
295
228struct smp_ops smp_ops = { 296struct smp_ops smp_ops = {
229 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 297 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
230 .smp_prepare_cpus = native_smp_prepare_cpus, 298 .smp_prepare_cpus = native_smp_prepare_cpus,
231 .smp_cpus_done = native_smp_cpus_done, 299 .smp_cpus_done = native_smp_cpus_done,
232 300
233 .stop_other_cpus = native_stop_other_cpus, 301 .stop_other_cpus = native_nmi_stop_other_cpus,
234 .smp_send_reschedule = native_smp_send_reschedule, 302 .smp_send_reschedule = native_smp_send_reschedule,
235 303
236 .cpu_up = native_cpu_up, 304 .cpu_up = native_cpu_up,
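[Editor's note] With this hunk, NMI-based stop becomes the default (.stop_other_cpus = native_nmi_stop_other_cpus) and the old REBOOT_VECTOR path is kept as a boot-time fallback. Adding:

    nonmi_ipi

to the kernel command line runs nonmi_ipi_setup(), which calls native_smp_disable_nmi_ipi() and swaps stop_other_cpus back to native_irq_stop_other_cpus for machines where the NMI IPI misbehaves.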
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e38e21754eea..66d250c00d11 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
207 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
208 */ 208 */
209 setup_vector_irq(smp_processor_id()); 209 setup_vector_irq(smp_processor_id());
210
211 /*
212 * Save our processor parameters. Note: this information
213 * is needed for clock calibration.
214 */
215 smp_store_cpu_info(cpuid);
216
210 /* 217 /*
211 * Get our bogomips. 218 * Get our bogomips.
219 * Update loops_per_jiffy in cpu_data. Previous call to
220 * smp_store_cpu_info() stored a value that is close but not as
221 * accurate as the value just calculated.
212 * 222 *
213 * Need to enable IRQs because it can take longer and then 223 * Need to enable IRQs because it can take longer and then
214 * the NMI watchdog might kill us. 224 * the NMI watchdog might kill us.
215 */ 225 */
216 local_irq_enable(); 226 local_irq_enable();
217 calibrate_delay(); 227 calibrate_delay();
228 cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
218 local_irq_disable(); 229 local_irq_disable();
219 pr_debug("Stack at about %p\n", &cpuid); 230 pr_debug("Stack at about %p\n", &cpuid);
220 231
221 /* 232 /*
222 * Save our processor parameters
223 */
224 smp_store_cpu_info(cpuid);
225
226 /*
227 * This must be done before setting cpu_online_mask 233 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting. 234 * or calling notify_cpu_starting.
229 */ 235 */
@@ -1143,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1143{ 1149{
1144 pr_debug("Boot done.\n"); 1150 pr_debug("Boot done.\n");
1145 1151
1152 nmi_selftest();
1146 impress_friends(); 1153 impress_friends();
1147#ifdef CONFIG_X86_IO_APIC 1154#ifdef CONFIG_X86_IO_APIC
1148 setup_ioapic_dest(); 1155 setup_ioapic_dest();
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 000000000000..147fcd4941c4
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
1/* System call table for i386. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
13
14typedef asmlinkage void (*sys_call_ptr_t)(void);
15
16extern asmlinkage void sys_ni_syscall(void);
17
18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h>
25};
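[Editor's note] The double inclusion of <asm/syscalls_32.h> is a classic X-macro: the first pass expands __SYSCALL_I386 into extern declarations, the second into designated table initializers. A tiny self-contained illustration of the same technique, with a hypothetical two-entry list instead of the kernel header:

    /* X-macro demo: one list, expanded twice with different definitions. */
    #include <stdio.h>

    #define SYSCALL_LIST(X) \
            X(0, demo_read) \
            X(1, demo_write)

    /* Pass 1: declare (here: define) the handlers. */
    #define X(nr, sym) static void sym(void) { puts(#sym); }
    SYSCALL_LIST(X)
    #undef X

    /* Pass 2: build the dispatch table, indexed by syscall number. */
    typedef void (*sys_call_ptr_t)(void);
    #define X(nr, sym) [nr] = sym,
    static const sys_call_ptr_t table[] = { SYSCALL_LIST(X) };
    #undef X

    int main(void)
    {
            table[1]();     /* prints "demo_write" */
            return 0;
    }

The kernel variant additionally pre-fills every slot with &sys_ni_syscall via a GNU range initializer, so unimplemented numbers fall through to -ENOSYS.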
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..7ac7943be02c 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __NO_STUBS 8#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_64.h>
10#undef __SYSCALL_64
9 11
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 12#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_UNISTD_64_H
17 13
18typedef void (*sys_call_ptr_t)(void); 14typedef void (*sys_call_ptr_t)(void);
19 15
@@ -21,9 +17,9 @@ extern void sys_ni_syscall(void);
21 17
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
23 /* 19 /*
24 *Smells like a like a compiler bug -- it doesn't work 20 * Smells like a compiler bug -- it doesn't work
25 *when the & below is removed. 21 * when the & below is removed.
26 */ 22 */
27 [0 ... __NR_syscall_max] = &sys_ni_syscall, 23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
28#include <asm/unistd_64.h> 24#include <asm/syscalls_64.h>
29}; 25};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e31293920..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long ptregs_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long ptregs_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long sys_old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long sys_old_readdir
92 .long sys_old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long ptregs_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long ptregs_sigreturn
122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_ni_syscall /* Old nfsservctl */
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long ptregs_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap_pgoff
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd_create
325 .long sys_eventfd
326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
335 .long sys_preadv
336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open
339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_process_vm_readv
350 .long sys_process_vm_writev
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fa1191fb679d..482ec3af2067 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
311 == NOTIFY_STOP) 311 == NOTIFY_STOP)
312 return; 312 return;
313 313
314 /*
315 * Let others (NMI) know that the debug stack is in use
316 * as we may switch to the interrupt stack.
317 */
318 debug_stack_usage_inc();
314 preempt_conditional_sti(regs); 319 preempt_conditional_sti(regs);
315 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
316 preempt_conditional_cli(regs); 321 preempt_conditional_cli(regs);
322 debug_stack_usage_dec();
317} 323}
318 324
319#ifdef CONFIG_X86_64 325#ifdef CONFIG_X86_64
@@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
406 SIGTRAP) == NOTIFY_STOP) 412 SIGTRAP) == NOTIFY_STOP)
407 return; 413 return;
408 414
415 /*
416 * Let others (NMI) know that the debug stack is in use
417 * as we may switch to the interrupt stack.
418 */
419 debug_stack_usage_inc();
420
409 /* It's safe to allow irq's after DR6 has been saved */ 421 /* It's safe to allow irq's after DR6 has been saved */
410 preempt_conditional_sti(regs); 422 preempt_conditional_sti(regs);
411 423
@@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
413 handle_vm86_trap((struct kernel_vm86_regs *) regs, 425 handle_vm86_trap((struct kernel_vm86_regs *) regs,
414 error_code, 1); 426 error_code, 1);
415 preempt_conditional_cli(regs); 427 preempt_conditional_cli(regs);
428 debug_stack_usage_dec();
416 return; 429 return;
417 } 430 }
418 431
@@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
432 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 445 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
433 send_sigtrap(tsk, regs, error_code, si_code); 446 send_sigtrap(tsk, regs, error_code, si_code);
434 preempt_conditional_cli(regs); 447 preempt_conditional_cli(regs);
448 debug_stack_usage_dec();
435 449
436 return; 450 return;
437} 451}
@@ -718,4 +732,10 @@ void __init trap_init(void)
718 cpu_init(); 732 cpu_init();
719 733
720 x86_init.irqs.trap_init(); 734 x86_init.irqs.trap_init();
735
736#ifdef CONFIG_X86_64
737 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
738 set_nmi_gate(1, &debug);
739 set_nmi_gate(3, &int3);
740#endif
721} 741}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f54694611172..a62c201c97ec 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -993,3 +993,23 @@ void __init tsc_init(void)
993 check_system_tsc_reliable(); 993 check_system_tsc_reliable();
994} 994}
995 995
996#ifdef CONFIG_SMP
997/*
998 * If we have a constant TSC and are using the TSC for the delay loop,
999 * we can skip clock calibration if another cpu in the same socket has already
1000 * been calibrated. This assumes that CONSTANT_TSC applies to all
1001 * cpus in the socket - this should be a safe assumption.
1002 */
1003unsigned long __cpuinit calibrate_delay_is_known(void)
1004{
1005 int i, cpu = smp_processor_id();
1006
1007 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
1008 return 0;
1009
1010 for_each_online_cpu(i)
1011 if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
1012 return cpu_data(i).loops_per_jiffy;
1013 return 0;
1014}
1015#endif
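
calibrate_delay_is_known() above turns the slow delay-loop calibration into a lookup: with CONSTANT_TSC every cpu in a package computes the same loops_per_jiffy, so the first calibrated sibling's value can be reused. A hedged stand-alone sketch of that memoization; the cpu table, socket ids, and the nonzero-means-calibrated convention are illustrative assumptions:

/* Reuse a same-socket sibling's calibration result, as in
 * calibrate_delay_is_known(). A zero loops_per_jiffy stands in for
 * "not calibrated yet"; the kernel walks the online-cpu mask instead. */
#include <stdio.h>

struct cpu {
	int phys_proc_id;		/* socket id */
	unsigned long loops_per_jiffy;	/* 0 = not calibrated yet */
};

static struct cpu cpus[4] = { {0, 0}, {0, 0}, {1, 0}, {1, 0} };

static unsigned long calibrate_delay_is_known(int cpu)
{
	for (int i = 0; i < 4; i++)
		if (cpus[i].phys_proc_id == cpus[cpu].phys_proc_id &&
		    cpus[i].loops_per_jiffy)
			return cpus[i].loops_per_jiffy;
	return 0;	/* nobody in this socket has calibrated */
}

int main(void)
{
	cpus[0].loops_per_jiffy = 4000000;	/* cpu0 paid for the calibration */
	printf("cpu1 reuses %lu\n", calibrate_delay_is_known(1));
	printf("cpu2 must calibrate itself: %lu\n", calibrate_delay_is_known(2));
	return 0;
}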
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..b466cab5ba15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
335 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
337 337
338 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call __audit_syscall_exit since we do not exit via the normal paths */
339#ifdef CONFIG_AUDITSYSCALL
339 if (unlikely(current->audit_context)) 340 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0); 341 __audit_syscall_exit(1, 0);
342#endif
341 343
342 __asm__ __volatile__( 344 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 345 "movl %0,%%esp\n\t"
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 91f83e21b989..947a06ccc673 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -115,4 +115,5 @@ struct x86_msi_ops x86_msi = {
115 .setup_msi_irqs = native_setup_msi_irqs, 115 .setup_msi_irqs = native_setup_msi_irqs,
116 .teardown_msi_irq = native_teardown_msi_irq, 116 .teardown_msi_irq = native_teardown_msi_irq,
117 .teardown_msi_irqs = default_teardown_msi_irqs, 117 .teardown_msi_irqs = default_teardown_msi_irqs,
118 .restore_msi_irqs = default_restore_msi_irqs,
118}; 119};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d8e990..1a7fe868f375 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
35 select KVM_MMIO 35 select KVM_MMIO
36 select TASKSTATS 36 select TASKSTATS
37 select TASK_DELAY_ACCT 37 select TASK_DELAY_ACCT
38 select PERF_EVENTS
38 ---help--- 39 ---help---
39 Support hosting fully virtualized guest machines using hardware 40 Support hosting fully virtualized guest machines using hardware
40 virtualization extensions. You will need a fairly recent 41 virtualization extensions. You will need a fairly recent
@@ -52,6 +53,8 @@ config KVM
52config KVM_INTEL 53config KVM_INTEL
53 tristate "KVM for Intel processors support" 54 tristate "KVM for Intel processors support"
54 depends on KVM 55 depends on KVM
56 # for perf_guest_get_msrs():
57 depends on CPU_SUP_INTEL
55 ---help--- 58 ---help---
56 Provides support for KVM on Intel processors equipped with the VT 59 Provides support for KVM on Intel processors equipped with the VT
57 extensions. 60 extensions.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f431c8..4f579e8dcacf 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 13
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
15 i8254.o timer.o 15 i8254.o timer.o cpuid.o pmu.o
16kvm-intel-y += vmx.o 16kvm-intel-y += vmx.o
17kvm-amd-y += svm.o 17kvm-amd-y += svm.o
18 18
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..89b02bfaaca5
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,670 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 * cpuid support routines
4 *
5 * derived from arch/x86/kvm/x86.c
6 *
7 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
8 * Copyright IBM Corporation, 2008
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 */
14
15#include <linux/kvm_host.h>
16#include <linux/module.h>
17#include <linux/vmalloc.h>
18#include <linux/uaccess.h>
19#include <asm/user.h>
20#include <asm/xsave.h>
21#include "cpuid.h"
22#include "lapic.h"
23#include "mmu.h"
24#include "trace.h"
25
26void kvm_update_cpuid(struct kvm_vcpu *vcpu)
27{
28 struct kvm_cpuid_entry2 *best;
29 struct kvm_lapic *apic = vcpu->arch.apic;
30
31 best = kvm_find_cpuid_entry(vcpu, 1, 0);
32 if (!best)
33 return;
34
35 /* Update OSXSAVE bit */
36 if (cpu_has_xsave && best->function == 0x1) {
37 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
38 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
39 best->ecx |= bit(X86_FEATURE_OSXSAVE);
40 }
41
42 if (apic) {
43 if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
44 apic->lapic_timer.timer_mode_mask = 3 << 17;
45 else
46 apic->lapic_timer.timer_mode_mask = 1 << 17;
47 }
48
49 kvm_pmu_cpuid_update(vcpu);
50}
51
52static int is_efer_nx(void)
53{
54 unsigned long long efer = 0;
55
56 rdmsrl_safe(MSR_EFER, &efer);
57 return efer & EFER_NX;
58}
59
60static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
61{
62 int i;
63 struct kvm_cpuid_entry2 *e, *entry;
64
65 entry = NULL;
66 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
67 e = &vcpu->arch.cpuid_entries[i];
68 if (e->function == 0x80000001) {
69 entry = e;
70 break;
71 }
72 }
73 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
74 entry->edx &= ~(1 << 20);
75 printk(KERN_INFO "kvm: guest NX capability removed\n");
76 }
77}
78
 79	/* legacy ioctl: old userspace filling the newer kernel module with struct kvm_cpuid */
80int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
81 struct kvm_cpuid *cpuid,
82 struct kvm_cpuid_entry __user *entries)
83{
84 int r, i;
85 struct kvm_cpuid_entry *cpuid_entries;
86
87 r = -E2BIG;
88 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
89 goto out;
90 r = -ENOMEM;
91 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
92 if (!cpuid_entries)
93 goto out;
94 r = -EFAULT;
95 if (copy_from_user(cpuid_entries, entries,
96 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
97 goto out_free;
98 for (i = 0; i < cpuid->nent; i++) {
99 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
100 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
101 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
102 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
103 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
104 vcpu->arch.cpuid_entries[i].index = 0;
105 vcpu->arch.cpuid_entries[i].flags = 0;
106 vcpu->arch.cpuid_entries[i].padding[0] = 0;
107 vcpu->arch.cpuid_entries[i].padding[1] = 0;
108 vcpu->arch.cpuid_entries[i].padding[2] = 0;
109 }
110 vcpu->arch.cpuid_nent = cpuid->nent;
111 cpuid_fix_nx_cap(vcpu);
112 r = 0;
113 kvm_apic_set_version(vcpu);
114 kvm_x86_ops->cpuid_update(vcpu);
115 kvm_update_cpuid(vcpu);
116
117out_free:
118 vfree(cpuid_entries);
119out:
120 return r;
121}
122
123int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
124 struct kvm_cpuid2 *cpuid,
125 struct kvm_cpuid_entry2 __user *entries)
126{
127 int r;
128
129 r = -E2BIG;
130 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
131 goto out;
132 r = -EFAULT;
133 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
134 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
135 goto out;
136 vcpu->arch.cpuid_nent = cpuid->nent;
137 kvm_apic_set_version(vcpu);
138 kvm_x86_ops->cpuid_update(vcpu);
139 kvm_update_cpuid(vcpu);
140 return 0;
141
142out:
143 return r;
144}
145
146int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
147 struct kvm_cpuid2 *cpuid,
148 struct kvm_cpuid_entry2 __user *entries)
149{
150 int r;
151
152 r = -E2BIG;
153 if (cpuid->nent < vcpu->arch.cpuid_nent)
154 goto out;
155 r = -EFAULT;
156 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
157 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
158 goto out;
159 return 0;
160
161out:
162 cpuid->nent = vcpu->arch.cpuid_nent;
163 return r;
164}
165
166static void cpuid_mask(u32 *word, int wordnum)
167{
168 *word &= boot_cpu_data.x86_capability[wordnum];
169}
170
171static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
172 u32 index)
173{
174 entry->function = function;
175 entry->index = index;
176 cpuid_count(entry->function, entry->index,
177 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
178 entry->flags = 0;
179}
180
181static bool supported_xcr0_bit(unsigned bit)
182{
183 u64 mask = ((u64)1 << bit);
184
185 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
186}
187
188#define F(x) bit(X86_FEATURE_##x)
189
190static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
191 u32 index, int *nent, int maxnent)
192{
193 int r;
194 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
195#ifdef CONFIG_X86_64
196 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
197 ? F(GBPAGES) : 0;
198 unsigned f_lm = F(LM);
199#else
200 unsigned f_gbpages = 0;
201 unsigned f_lm = 0;
202#endif
203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
204
205 /* cpuid 1.edx */
206 const u32 kvm_supported_word0_x86_features =
207 F(FPU) | F(VME) | F(DE) | F(PSE) |
208 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
209 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
210 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
211 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
212 0 /* Reserved, DS, ACPI */ | F(MMX) |
213 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
214 0 /* HTT, TM, Reserved, PBE */;
215 /* cpuid 0x80000001.edx */
216 const u32 kvm_supported_word1_x86_features =
217 F(FPU) | F(VME) | F(DE) | F(PSE) |
218 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
219 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
220 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
221 F(PAT) | F(PSE36) | 0 /* Reserved */ |
222 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
223 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
224 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
225 /* cpuid 1.ecx */
226 const u32 kvm_supported_word4_x86_features =
227 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
228 0 /* DS-CPL, VMX, SMX, EST */ |
229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
231 0 /* Reserved, DCA */ | F(XMM4_1) |
232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
234 F(F16C) | F(RDRAND);
235 /* cpuid 0x80000001.ecx */
236 const u32 kvm_supported_word6_x86_features =
237 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
238 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
239 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
240 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
241
242 /* cpuid 0xC0000001.edx */
243 const u32 kvm_supported_word5_x86_features =
244 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
245 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
246 F(PMM) | F(PMM_EN);
247
248 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
251
252 /* all calls to cpuid_count() should be made on the same cpu */
253 get_cpu();
254
255 r = -E2BIG;
256
257 if (*nent >= maxnent)
258 goto out;
259
260 do_cpuid_1_ent(entry, function, index);
261 ++*nent;
262
263 switch (function) {
264 case 0:
265 entry->eax = min(entry->eax, (u32)0xd);
266 break;
267 case 1:
268 entry->edx &= kvm_supported_word0_x86_features;
269 cpuid_mask(&entry->edx, 0);
270 entry->ecx &= kvm_supported_word4_x86_features;
271 cpuid_mask(&entry->ecx, 4);
272	/* we support x2apic emulation even if the host does not support
273	 * it, since we emulate x2apic in software */
274 entry->ecx |= F(X2APIC);
275 break;
276 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
277 * may return different values. This forces us to get_cpu() before
278 * issuing the first command, and also to emulate this annoying behavior
279 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
280 case 2: {
281 int t, times = entry->eax & 0xff;
282
283 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
284 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
285 for (t = 1; t < times; ++t) {
286 if (*nent >= maxnent)
287 goto out;
288
289 do_cpuid_1_ent(&entry[t], function, 0);
290 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
291 ++*nent;
292 }
293 break;
294 }
295 /* function 4 has additional index. */
296 case 4: {
297 int i, cache_type;
298
299 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
300 /* read more entries until cache_type is zero */
301 for (i = 1; ; ++i) {
302 if (*nent >= maxnent)
303 goto out;
304
305 cache_type = entry[i - 1].eax & 0x1f;
306 if (!cache_type)
307 break;
308 do_cpuid_1_ent(&entry[i], function, i);
309 entry[i].flags |=
310 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
311 ++*nent;
312 }
313 break;
314 }
315 case 7: {
316 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
317	/* Mask ebx against host capability word 9 */
318 if (index == 0) {
319 entry->ebx &= kvm_supported_word9_x86_features;
320 cpuid_mask(&entry->ebx, 9);
321 } else
322 entry->ebx = 0;
323 entry->eax = 0;
324 entry->ecx = 0;
325 entry->edx = 0;
326 break;
327 }
328 case 9:
329 break;
330 case 0xa: { /* Architectural Performance Monitoring */
331 struct x86_pmu_capability cap;
332 union cpuid10_eax eax;
333 union cpuid10_edx edx;
334
335 perf_get_x86_pmu_capability(&cap);
336
337 /*
338 * Only support guest architectural pmu on a host
339 * with architectural pmu.
340 */
341 if (!cap.version)
342 memset(&cap, 0, sizeof(cap));
343
344 eax.split.version_id = min(cap.version, 2);
345 eax.split.num_counters = cap.num_counters_gp;
346 eax.split.bit_width = cap.bit_width_gp;
347 eax.split.mask_length = cap.events_mask_len;
348
349 edx.split.num_counters_fixed = cap.num_counters_fixed;
350 edx.split.bit_width_fixed = cap.bit_width_fixed;
351 edx.split.reserved = 0;
352
353 entry->eax = eax.full;
354 entry->ebx = cap.events_mask;
355 entry->ecx = 0;
356 entry->edx = edx.full;
357 break;
358 }
359 /* function 0xb has additional index. */
360 case 0xb: {
361 int i, level_type;
362
363 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
364 /* read more entries until level_type is zero */
365 for (i = 1; ; ++i) {
366 if (*nent >= maxnent)
367 goto out;
368
369 level_type = entry[i - 1].ecx & 0xff00;
370 if (!level_type)
371 break;
372 do_cpuid_1_ent(&entry[i], function, i);
373 entry[i].flags |=
374 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
375 ++*nent;
376 }
377 break;
378 }
379 case 0xd: {
380 int idx, i;
381
382 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
383 for (idx = 1, i = 1; idx < 64; ++idx) {
384 if (*nent >= maxnent)
385 goto out;
386
387 do_cpuid_1_ent(&entry[i], function, idx);
388 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
389 continue;
390 entry[i].flags |=
391 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
392 ++*nent;
393 ++i;
394 }
395 break;
396 }
397 case KVM_CPUID_SIGNATURE: {
398 char signature[12] = "KVMKVMKVM\0\0";
399 u32 *sigptr = (u32 *)signature;
400 entry->eax = 0;
401 entry->ebx = sigptr[0];
402 entry->ecx = sigptr[1];
403 entry->edx = sigptr[2];
404 break;
405 }
406 case KVM_CPUID_FEATURES:
407 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
408 (1 << KVM_FEATURE_NOP_IO_DELAY) |
409 (1 << KVM_FEATURE_CLOCKSOURCE2) |
410 (1 << KVM_FEATURE_ASYNC_PF) |
411 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
412
413 if (sched_info_on())
414 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
415
416 entry->ebx = 0;
417 entry->ecx = 0;
418 entry->edx = 0;
419 break;
420 case 0x80000000:
421 entry->eax = min(entry->eax, 0x8000001a);
422 break;
423 case 0x80000001:
424 entry->edx &= kvm_supported_word1_x86_features;
425 cpuid_mask(&entry->edx, 1);
426 entry->ecx &= kvm_supported_word6_x86_features;
427 cpuid_mask(&entry->ecx, 6);
428 break;
429 case 0x80000008: {
430 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
431 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
432 unsigned phys_as = entry->eax & 0xff;
433
434 if (!g_phys_as)
435 g_phys_as = phys_as;
436 entry->eax = g_phys_as | (virt_as << 8);
437 entry->ebx = entry->edx = 0;
438 break;
439 }
440 case 0x80000019:
441 entry->ecx = entry->edx = 0;
442 break;
443 case 0x8000001a:
444 break;
445 case 0x8000001d:
446 break;
447	/* Add support for Centaur's CPUID instruction */
448 case 0xC0000000:
449	/* Just support up to 0xC0000004 now */
450 entry->eax = min(entry->eax, 0xC0000004);
451 break;
452 case 0xC0000001:
453 entry->edx &= kvm_supported_word5_x86_features;
454 cpuid_mask(&entry->edx, 5);
455 break;
456 case 3: /* Processor serial number */
457 case 5: /* MONITOR/MWAIT */
458 case 6: /* Thermal management */
459 case 0x80000007: /* Advanced power management */
460 case 0xC0000002:
461 case 0xC0000003:
462 case 0xC0000004:
463 default:
464 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
465 break;
466 }
467
468 kvm_x86_ops->set_supported_cpuid(function, entry);
469
470 r = 0;
471
472out:
473 put_cpu();
474
475 return r;
476}
477
478#undef F
479
480struct kvm_cpuid_param {
481 u32 func;
482 u32 idx;
483 bool has_leaf_count;
484 bool (*qualifier)(struct kvm_cpuid_param *param);
485};
486
487static bool is_centaur_cpu(struct kvm_cpuid_param *param)
488{
489 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
490}
491
492int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
493 struct kvm_cpuid_entry2 __user *entries)
494{
495 struct kvm_cpuid_entry2 *cpuid_entries;
496 int limit, nent = 0, r = -E2BIG, i;
497 u32 func;
498 static struct kvm_cpuid_param param[] = {
499 { .func = 0, .has_leaf_count = true },
500 { .func = 0x80000000, .has_leaf_count = true },
501 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
502 { .func = KVM_CPUID_SIGNATURE },
503 { .func = KVM_CPUID_FEATURES },
504 };
505
506 if (cpuid->nent < 1)
507 goto out;
508 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
509 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
510 r = -ENOMEM;
511 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
512 if (!cpuid_entries)
513 goto out;
514
515 r = 0;
516 for (i = 0; i < ARRAY_SIZE(param); i++) {
517 struct kvm_cpuid_param *ent = &param[i];
518
519 if (ent->qualifier && !ent->qualifier(ent))
520 continue;
521
522 r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
523 &nent, cpuid->nent);
524
525 if (r)
526 goto out_free;
527
528 if (!ent->has_leaf_count)
529 continue;
530
531 limit = cpuid_entries[nent - 1].eax;
532 for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
533 r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
534 &nent, cpuid->nent);
535
536 if (r)
537 goto out_free;
538 }
539
540 r = -EFAULT;
541 if (copy_to_user(entries, cpuid_entries,
542 nent * sizeof(struct kvm_cpuid_entry2)))
543 goto out_free;
544 cpuid->nent = nent;
545 r = 0;
546
547out_free:
548 vfree(cpuid_entries);
549out:
550 return r;
551}
552
553static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
554{
555 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
556 int j, nent = vcpu->arch.cpuid_nent;
557
558 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
559 /* when no next entry is found, the current entry[i] is reselected */
560 for (j = i + 1; ; j = (j + 1) % nent) {
561 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
562 if (ej->function == e->function) {
563 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
564 return j;
565 }
566 }
567 return 0; /* silence gcc, even though control never reaches here */
568}
569
570/* find an entry with matching function, matching index (if needed), and that
571 * should be read next (if it's stateful) */
572static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
573 u32 function, u32 index)
574{
575 if (e->function != function)
576 return 0;
577 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
578 return 0;
579 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
580 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
581 return 0;
582 return 1;
583}
584
585struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
586 u32 function, u32 index)
587{
588 int i;
589 struct kvm_cpuid_entry2 *best = NULL;
590
591 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
592 struct kvm_cpuid_entry2 *e;
593
594 e = &vcpu->arch.cpuid_entries[i];
595 if (is_matching_cpuid_entry(e, function, index)) {
596 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
597 move_to_next_stateful_cpuid_entry(vcpu, i);
598 best = e;
599 break;
600 }
601 }
602 return best;
603}
604EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
605
606int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
607{
608 struct kvm_cpuid_entry2 *best;
609
610 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
611 if (!best || best->eax < 0x80000008)
612 goto not_found;
613 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
614 if (best)
615 return best->eax & 0xff;
616not_found:
617 return 36;
618}
619
620/*
621 * If no match is found, check whether we exceed the vCPU's limit
622 * and return the content of the highest valid _standard_ leaf instead.
623 * This is to satisfy the CPUID specification.
624 */
625static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
626 u32 function, u32 index)
627{
628 struct kvm_cpuid_entry2 *maxlevel;
629
630 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
631 if (!maxlevel || maxlevel->eax >= function)
632 return NULL;
633 if (function & 0x80000000) {
634 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
635 if (!maxlevel)
636 return NULL;
637 }
638 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
639}
640
641void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
642{
643 u32 function, index;
644 struct kvm_cpuid_entry2 *best;
645
646 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
647 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
648 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
649 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
650 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
651 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
652 best = kvm_find_cpuid_entry(vcpu, function, index);
653
654 if (!best)
655 best = check_cpuid_limit(vcpu, function, index);
656
657 if (best) {
658 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
659 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
660 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
661 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
662 }
663 kvm_x86_ops->skip_emulated_instruction(vcpu);
664 trace_kvm_cpuid(function,
665 kvm_register_read(vcpu, VCPU_REGS_RAX),
666 kvm_register_read(vcpu, VCPU_REGS_RBX),
667 kvm_register_read(vcpu, VCPU_REGS_RCX),
668 kvm_register_read(vcpu, VCPU_REGS_RDX));
669}
670EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
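
do_cpuid_ent() walks indexed leaves (4, 0xb, 0xd) by re-issuing CPUID with an incrementing ECX until a terminator field comes back zero. The same enumeration works from user space on the host; a hedged sketch using gcc's <cpuid.h>, mirroring the entry[i - 1].eax & 0x1f termination test for leaf 4:

/* Enumerate CPUID leaf 4 sub-leaves the way do_cpuid_ent() does: bump
 * the index until the cache-type field (EAX[4:0]) reads zero.
 * __get_cpuid_count() is gcc/clang's checked cpuid wrapper. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	for (unsigned int idx = 0; ; idx++) {
		if (!__get_cpuid_count(4, idx, &eax, &ebx, &ecx, &edx))
			break;			/* leaf 4 not supported */
		if ((eax & 0x1f) == 0)
			break;			/* terminator: no more caches */
		printf("cache %u: type %u, level %u\n",
		       idx, eax & 0x1f, (eax >> 5) & 0x7);
	}
	return 0;
}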
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..5b97e1797a6d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@
1#ifndef ARCH_X86_KVM_CPUID_H
2#define ARCH_X86_KVM_CPUID_H
3
4#include "x86.h"
5
6void kvm_update_cpuid(struct kvm_vcpu *vcpu);
7struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
8 u32 function, u32 index);
9int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
10 struct kvm_cpuid_entry2 __user *entries);
11int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
12 struct kvm_cpuid *cpuid,
13 struct kvm_cpuid_entry __user *entries);
14int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
15 struct kvm_cpuid2 *cpuid,
16 struct kvm_cpuid_entry2 __user *entries);
17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
18 struct kvm_cpuid2 *cpuid,
19 struct kvm_cpuid_entry2 __user *entries);
20
21
22static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
23{
24 struct kvm_cpuid_entry2 *best;
25
26 best = kvm_find_cpuid_entry(vcpu, 1, 0);
27 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
28}
29
30static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
31{
32 struct kvm_cpuid_entry2 *best;
33
34 best = kvm_find_cpuid_entry(vcpu, 7, 0);
35 return best && (best->ebx & bit(X86_FEATURE_SMEP));
36}
37
38static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
39{
40 struct kvm_cpuid_entry2 *best;
41
42 best = kvm_find_cpuid_entry(vcpu, 7, 0);
43 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
44}
45
46#endif
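
The KVM_CPUID_SIGNATURE leaf built in cpuid.c above returns the string "KVMKVMKVM\0\0\0" in EBX:ECX:EDX; guests use it to decide whether the paravirt feature leaf is worth reading. A hedged guest-side probe (0x40000000 is the conventional hypervisor base leaf; run it inside a KVM guest, as the leaf is undefined on bare metal):

/* Read the hypervisor signature that do_cpuid_ent() installs at
 * KVM_CPUID_SIGNATURE. __cpuid() is the raw macro from <cpuid.h>;
 * the checked __get_cpuid() would refuse leaves above the basic max. */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax = 0, sig[3] = { 0, 0, 0 };
	char name[13] = { 0 };

	__cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);
	memcpy(name, sig, 12);
	printf("max hv leaf %#x, signature \"%s\"\n", eax, name);
	return strcmp(name, "KVMKVMKVM") != 0;
}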
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18a08f..05a562b85025 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@
125#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 125#define Lock (1<<26) /* lock prefix is allowed for the instruction */
126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
127#define No64 (1<<28) 127#define No64 (1<<28)
128#define PageTable (1 << 29) /* instruction used to write page table */
128/* Source 2 operand type */ 129/* Source 2 operand type */
129#define Src2Shift (29) 130#define Src2Shift (30)
130#define Src2None (OpNone << Src2Shift) 131#define Src2None (OpNone << Src2Shift)
131#define Src2CL (OpCL << Src2Shift) 132#define Src2CL (OpCL << Src2Shift)
132#define Src2ImmByte (OpImmByte << Src2Shift) 133#define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1674,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1674 return X86EMUL_CONTINUE; 1675 return X86EMUL_CONTINUE;
1675} 1676}
1676 1677
1677static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1678{
1679 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
1680}
1681
1682static int em_grp2(struct x86_emulate_ctxt *ctxt) 1678static int em_grp2(struct x86_emulate_ctxt *ctxt)
1683{ 1679{
1684 switch (ctxt->modrm_reg) { 1680 switch (ctxt->modrm_reg) {
@@ -1788,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1788 return rc; 1784 return rc;
1789} 1785}
1790 1786
1791static int em_grp9(struct x86_emulate_ctxt *ctxt) 1787static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
1792{ 1788{
1793 u64 old = ctxt->dst.orig_val64; 1789 u64 old = ctxt->dst.orig_val64;
1794 1790
@@ -1831,6 +1827,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1831 return rc; 1827 return rc;
1832} 1828}
1833 1829
1830static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
1831{
1832 /* Save real source value, then compare EAX against destination. */
1833 ctxt->src.orig_val = ctxt->src.val;
1834 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
1835 emulate_2op_SrcV(ctxt, "cmp");
1836
1837 if (ctxt->eflags & EFLG_ZF) {
1838 /* Success: write back to memory. */
1839 ctxt->dst.val = ctxt->src.orig_val;
1840 } else {
1841 /* Failure: write the value we saw to EAX. */
1842 ctxt->dst.type = OP_REG;
1843 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
1844 }
1845 return X86EMUL_CONTINUE;
1846}
1847
1834static int em_lseg(struct x86_emulate_ctxt *ctxt) 1848static int em_lseg(struct x86_emulate_ctxt *ctxt)
1835{ 1849{
1836 int seg = ctxt->src2.val; 1850 int seg = ctxt->src2.val;
@@ -2481,6 +2495,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2481 return X86EMUL_CONTINUE; 2495 return X86EMUL_CONTINUE;
2482} 2496}
2483 2497
2498static int em_call(struct x86_emulate_ctxt *ctxt)
2499{
2500 long rel = ctxt->src.val;
2501
2502 ctxt->src.val = (unsigned long)ctxt->_eip;
2503 jmp_rel(ctxt, rel);
2504 return em_push(ctxt);
2505}
2506
2484static int em_call_far(struct x86_emulate_ctxt *ctxt) 2507static int em_call_far(struct x86_emulate_ctxt *ctxt)
2485{ 2508{
2486 u16 sel, old_cs; 2509 u16 sel, old_cs;
@@ -2622,12 +2645,75 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2622 return X86EMUL_CONTINUE; 2645 return X86EMUL_CONTINUE;
2623} 2646}
2624 2647
2648static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2649{
2650 u64 pmc;
2651
2652 if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
2653 return emulate_gp(ctxt, 0);
2654 ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
2655 ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
2656 return X86EMUL_CONTINUE;
2657}
2658
2625static int em_mov(struct x86_emulate_ctxt *ctxt) 2659static int em_mov(struct x86_emulate_ctxt *ctxt)
2626{ 2660{
2627 ctxt->dst.val = ctxt->src.val; 2661 ctxt->dst.val = ctxt->src.val;
2628 return X86EMUL_CONTINUE; 2662 return X86EMUL_CONTINUE;
2629} 2663}
2630 2664
2665static int em_cr_write(struct x86_emulate_ctxt *ctxt)
2666{
2667 if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
2668 return emulate_gp(ctxt, 0);
2669
2670 /* Disable writeback. */
2671 ctxt->dst.type = OP_NONE;
2672 return X86EMUL_CONTINUE;
2673}
2674
2675static int em_dr_write(struct x86_emulate_ctxt *ctxt)
2676{
2677 unsigned long val;
2678
2679 if (ctxt->mode == X86EMUL_MODE_PROT64)
2680 val = ctxt->src.val & ~0ULL;
2681 else
2682 val = ctxt->src.val & ~0U;
2683
2684 /* #UD condition is already handled. */
2685 if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
2686 return emulate_gp(ctxt, 0);
2687
2688 /* Disable writeback. */
2689 ctxt->dst.type = OP_NONE;
2690 return X86EMUL_CONTINUE;
2691}
2692
2693static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
2694{
2695 u64 msr_data;
2696
2697 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
2698 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
2699 if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
2700 return emulate_gp(ctxt, 0);
2701
2702 return X86EMUL_CONTINUE;
2703}
2704
2705static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
2706{
2707 u64 msr_data;
2708
2709 if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
2710 return emulate_gp(ctxt, 0);
2711
2712 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
2713 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
2714 return X86EMUL_CONTINUE;
2715}
2716
2631static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) 2717static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
2632{ 2718{
2633 if (ctxt->modrm_reg > VCPU_SREG_GS) 2719 if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -2775,6 +2861,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
2775 return X86EMUL_CONTINUE; 2861 return X86EMUL_CONTINUE;
2776} 2862}
2777 2863
2864static int em_in(struct x86_emulate_ctxt *ctxt)
2865{
2866 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
2867 &ctxt->dst.val))
2868 return X86EMUL_IO_NEEDED;
2869
2870 return X86EMUL_CONTINUE;
2871}
2872
2873static int em_out(struct x86_emulate_ctxt *ctxt)
2874{
2875 ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
2876 &ctxt->src.val, 1);
2877 /* Disable writeback. */
2878 ctxt->dst.type = OP_NONE;
2879 return X86EMUL_CONTINUE;
2880}
2881
2778static int em_cli(struct x86_emulate_ctxt *ctxt) 2882static int em_cli(struct x86_emulate_ctxt *ctxt)
2779{ 2883{
2780 if (emulator_bad_iopl(ctxt)) 2884 if (emulator_bad_iopl(ctxt))
@@ -2794,6 +2898,69 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
2794 return X86EMUL_CONTINUE; 2898 return X86EMUL_CONTINUE;
2795} 2899}
2796 2900
2901static int em_bt(struct x86_emulate_ctxt *ctxt)
2902{
2903 /* Disable writeback. */
2904 ctxt->dst.type = OP_NONE;
2905 /* only subword offset */
2906 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
2907
2908 emulate_2op_SrcV_nobyte(ctxt, "bt");
2909 return X86EMUL_CONTINUE;
2910}
2911
2912static int em_bts(struct x86_emulate_ctxt *ctxt)
2913{
2914 emulate_2op_SrcV_nobyte(ctxt, "bts");
2915 return X86EMUL_CONTINUE;
2916}
2917
2918static int em_btr(struct x86_emulate_ctxt *ctxt)
2919{
2920 emulate_2op_SrcV_nobyte(ctxt, "btr");
2921 return X86EMUL_CONTINUE;
2922}
2923
2924static int em_btc(struct x86_emulate_ctxt *ctxt)
2925{
2926 emulate_2op_SrcV_nobyte(ctxt, "btc");
2927 return X86EMUL_CONTINUE;
2928}
2929
2930static int em_bsf(struct x86_emulate_ctxt *ctxt)
2931{
2932 u8 zf;
2933
2934 __asm__ ("bsf %2, %0; setz %1"
2935 : "=r"(ctxt->dst.val), "=q"(zf)
2936 : "r"(ctxt->src.val));
2937
2938 ctxt->eflags &= ~X86_EFLAGS_ZF;
2939 if (zf) {
2940 ctxt->eflags |= X86_EFLAGS_ZF;
2941 /* Disable writeback. */
2942 ctxt->dst.type = OP_NONE;
2943 }
2944 return X86EMUL_CONTINUE;
2945}
2946
2947static int em_bsr(struct x86_emulate_ctxt *ctxt)
2948{
2949 u8 zf;
2950
2951 __asm__ ("bsr %2, %0; setz %1"
2952 : "=r"(ctxt->dst.val), "=q"(zf)
2953 : "r"(ctxt->src.val));
2954
2955 ctxt->eflags &= ~X86_EFLAGS_ZF;
2956 if (zf) {
2957 ctxt->eflags |= X86_EFLAGS_ZF;
2958 /* Disable writeback. */
2959 ctxt->dst.type = OP_NONE;
2960 }
2961 return X86EMUL_CONTINUE;
2962}
2963
2797static bool valid_cr(int nr) 2964static bool valid_cr(int nr)
2798{ 2965{
2799 switch (nr) { 2966 switch (nr) {
@@ -2867,9 +3034,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2867 break; 3034 break;
2868 } 3035 }
2869 case 4: { 3036 case 4: {
2870 u64 cr4;
2871
2872 cr4 = ctxt->ops->get_cr(ctxt, 4);
2873 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 3037 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2874 3038
2875 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) 3039 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
@@ -3003,6 +3167,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3003#define D2bv(_f) D((_f) | ByteOp), D(_f) 3167#define D2bv(_f) D((_f) | ByteOp), D(_f)
3004#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) 3168#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3005#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3169#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3170#define I2bvIP(_f, _e, _i, _p) \
3171 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
3006 3172
3007#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ 3173#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3008 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3174 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
@@ -3033,17 +3199,17 @@ static struct opcode group7_rm7[] = {
3033 3199
3034static struct opcode group1[] = { 3200static struct opcode group1[] = {
3035 I(Lock, em_add), 3201 I(Lock, em_add),
3036 I(Lock, em_or), 3202 I(Lock | PageTable, em_or),
3037 I(Lock, em_adc), 3203 I(Lock, em_adc),
3038 I(Lock, em_sbb), 3204 I(Lock, em_sbb),
3039 I(Lock, em_and), 3205 I(Lock | PageTable, em_and),
3040 I(Lock, em_sub), 3206 I(Lock, em_sub),
3041 I(Lock, em_xor), 3207 I(Lock, em_xor),
3042 I(0, em_cmp), 3208 I(0, em_cmp),
3043}; 3209};
3044 3210
3045static struct opcode group1A[] = { 3211static struct opcode group1A[] = {
3046 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, 3212 I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3047}; 3213};
3048 3214
3049static struct opcode group3[] = { 3215static struct opcode group3[] = {
@@ -3058,16 +3224,19 @@ static struct opcode group3[] = {
3058}; 3224};
3059 3225
3060static struct opcode group4[] = { 3226static struct opcode group4[] = {
3061 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), 3227 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
3228 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
3062 N, N, N, N, N, N, 3229 N, N, N, N, N, N,
3063}; 3230};
3064 3231
3065static struct opcode group5[] = { 3232static struct opcode group5[] = {
3066 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), 3233 I(DstMem | SrcNone | ModRM | Lock, em_grp45),
3067 D(SrcMem | ModRM | Stack), 3234 I(DstMem | SrcNone | ModRM | Lock, em_grp45),
3235 I(SrcMem | ModRM | Stack, em_grp45),
3068 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), 3236 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
3069 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), 3237 I(SrcMem | ModRM | Stack, em_grp45),
3070 D(SrcMem | ModRM | Stack), N, 3238 I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
3239 I(SrcMem | ModRM | Stack, em_grp45), N,
3071}; 3240};
3072 3241
3073static struct opcode group6[] = { 3242static struct opcode group6[] = {
@@ -3096,18 +3265,21 @@ static struct group_dual group7 = { {
3096 3265
3097static struct opcode group8[] = { 3266static struct opcode group8[] = {
3098 N, N, N, N, 3267 N, N, N, N,
3099 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), 3268 I(DstMem | SrcImmByte | ModRM, em_bt),
3100 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), 3269 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
3270 I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
3271 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
3101}; 3272};
3102 3273
3103static struct group_dual group9 = { { 3274static struct group_dual group9 = { {
3104 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, 3275 N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3105}, { 3276}, {
3106 N, N, N, N, N, N, N, N, 3277 N, N, N, N, N, N, N, N,
3107} }; 3278} };
3108 3279
3109static struct opcode group11[] = { 3280static struct opcode group11[] = {
3110 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), 3281 I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
3282 X7(D(Undefined)),
3111}; 3283};
3112 3284
3113static struct gprefix pfx_0f_6f_0f_7f = { 3285static struct gprefix pfx_0f_6f_0f_7f = {
@@ -3120,7 +3292,7 @@ static struct opcode opcode_table[256] = {
3120 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3292 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3121 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), 3293 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3122 /* 0x08 - 0x0F */ 3294 /* 0x08 - 0x0F */
3123 I6ALU(Lock, em_or), 3295 I6ALU(Lock | PageTable, em_or),
3124 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), 3296 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3125 N, 3297 N,
3126 /* 0x10 - 0x17 */ 3298 /* 0x10 - 0x17 */
@@ -3132,7 +3304,7 @@ static struct opcode opcode_table[256] = {
3132 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), 3304 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3133 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), 3305 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3134 /* 0x20 - 0x27 */ 3306 /* 0x20 - 0x27 */
3135 I6ALU(Lock, em_and), N, N, 3307 I6ALU(Lock | PageTable, em_and), N, N,
3136 /* 0x28 - 0x2F */ 3308 /* 0x28 - 0x2F */
3137 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), 3309 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3138 /* 0x30 - 0x37 */ 3310 /* 0x30 - 0x37 */
@@ -3155,8 +3327,8 @@ static struct opcode opcode_table[256] = {
3155 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3327 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3156 I(SrcImmByte | Mov | Stack, em_push), 3328 I(SrcImmByte | Mov | Stack, em_push),
3157 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3329 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3158 D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */ 3330 I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
3159 D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */ 3331 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
3160 /* 0x70 - 0x7F */ 3332 /* 0x70 - 0x7F */
3161 X16(D(SrcImmByte)), 3333 X16(D(SrcImmByte)),
3162 /* 0x80 - 0x87 */ 3334 /* 0x80 - 0x87 */
@@ -3165,11 +3337,11 @@ static struct opcode opcode_table[256] = {
3165 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3337 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3166 G(DstMem | SrcImmByte | ModRM | Group, group1), 3338 G(DstMem | SrcImmByte | ModRM | Group, group1),
3167 I2bv(DstMem | SrcReg | ModRM, em_test), 3339 I2bv(DstMem | SrcReg | ModRM, em_test),
3168 I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), 3340 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3169 /* 0x88 - 0x8F */ 3341 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), 3342 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), 3343 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), 3344 I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
3173 D(ModRM | SrcMem | NoAccess | DstReg), 3345 D(ModRM | SrcMem | NoAccess | DstReg),
3174 I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), 3346 I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
3175 G(0, group1A), 3347 G(0, group1A),
@@ -3182,7 +3354,7 @@ static struct opcode opcode_table[256] = {
3182 II(ImplicitOps | Stack, em_popf, popf), N, N, 3354 II(ImplicitOps | Stack, em_popf, popf), N, N,
3183 /* 0xA0 - 0xA7 */ 3355 /* 0xA0 - 0xA7 */
3184 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3356 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3185 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), 3357 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
3186 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3358 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3187 I2bv(SrcSI | DstDI | String, em_cmp), 3359 I2bv(SrcSI | DstDI | String, em_cmp),
3188 /* 0xA8 - 0xAF */ 3360 /* 0xA8 - 0xAF */
@@ -3213,13 +3385,13 @@ static struct opcode opcode_table[256] = {
3213 /* 0xE0 - 0xE7 */ 3385 /* 0xE0 - 0xE7 */
3214 X3(I(SrcImmByte, em_loop)), 3386 X3(I(SrcImmByte, em_loop)),
3215 I(SrcImmByte, em_jcxz), 3387 I(SrcImmByte, em_jcxz),
3216 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), 3388 I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
3217 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), 3389 I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
3218 /* 0xE8 - 0xEF */ 3390 /* 0xE8 - 0xEF */
3219 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3391 I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
3220 I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), 3392 I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
3221 D2bvIP(SrcDX | DstAcc, in, check_perm_in), 3393 I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
3222 D2bvIP(SrcAcc | DstDX, out, check_perm_out), 3394 I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
3223 /* 0xF0 - 0xF7 */ 3395 /* 0xF0 - 0xF7 */
3224 N, DI(ImplicitOps, icebp), N, N, 3396 N, DI(ImplicitOps, icebp), N, N,
3225 DI(ImplicitOps | Priv, hlt), D(ImplicitOps), 3397 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3242,15 +3414,15 @@ static struct opcode twobyte_table[256] = {
3242 /* 0x20 - 0x2F */ 3414 /* 0x20 - 0x2F */
3243 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), 3415 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
3244 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), 3416 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
3245 DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), 3417 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
3246 DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), 3418 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
3247 N, N, N, N, 3419 N, N, N, N,
3248 N, N, N, N, N, N, N, N, 3420 N, N, N, N, N, N, N, N,
3249 /* 0x30 - 0x3F */ 3421 /* 0x30 - 0x3F */
3250 DI(ImplicitOps | Priv, wrmsr), 3422 II(ImplicitOps | Priv, em_wrmsr, wrmsr),
3251 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3423 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3252 DI(ImplicitOps | Priv, rdmsr), 3424 II(ImplicitOps | Priv, em_rdmsr, rdmsr),
3253 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), 3425 IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
3254 I(ImplicitOps | VendorSpecific, em_sysenter), 3426 I(ImplicitOps | VendorSpecific, em_sysenter),
3255 I(ImplicitOps | Priv | VendorSpecific, em_sysexit), 3427 I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
3256 N, N, 3428 N, N,
@@ -3275,26 +3447,28 @@ static struct opcode twobyte_table[256] = {
3275 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3447 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3276 /* 0xA0 - 0xA7 */ 3448 /* 0xA0 - 0xA7 */
3277 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3449 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3278 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), 3450 DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
3279 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3451 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3280 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3452 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3281 /* 0xA8 - 0xAF */ 3453 /* 0xA8 - 0xAF */
3282 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 3454 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3283 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3455 DI(ImplicitOps, rsm),
3456 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
3284 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3457 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3285 D(DstMem | SrcReg | Src2CL | ModRM), 3458 D(DstMem | SrcReg | Src2CL | ModRM),
3286 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 3459 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
3287 /* 0xB0 - 0xB7 */ 3460 /* 0xB0 - 0xB7 */
3288 D2bv(DstMem | SrcReg | ModRM | Lock), 3461 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
3289 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), 3462 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3290 D(DstMem | SrcReg | ModRM | BitOp | Lock), 3463 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
3291 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 3464 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3292 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), 3465 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3293 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3466 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3294 /* 0xB8 - 0xBF */ 3467 /* 0xB8 - 0xBF */
3295 N, N, 3468 N, N,
3296 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3469 G(BitOp, group8),
3297 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), 3470 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3471 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3298 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3472 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3299 /* 0xC0 - 0xCF */ 3473 /* 0xC0 - 0xCF */
3300 D2bv(DstMem | SrcReg | ModRM | Lock), 3474 D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3320,6 +3494,7 @@ static struct opcode twobyte_table[256] = {
3320#undef D2bv 3494#undef D2bv
3321#undef D2bvIP 3495#undef D2bvIP
3322#undef I2bv 3496#undef I2bv
3497#undef I2bvIP
3323#undef I6ALU 3498#undef I6ALU
3324 3499
3325static unsigned imm_size(struct x86_emulate_ctxt *ctxt) 3500static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
@@ -3697,6 +3872,11 @@ done:
3697 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; 3872 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
3698} 3873}
3699 3874
3875bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
3876{
3877 return ctxt->d & PageTable;
3878}
3879
3700static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3880static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3701{ 3881{
3702 /* The second termination condition only applies for REPE 3882 /* The second termination condition only applies for REPE
@@ -3720,7 +3900,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3720int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 3900int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3721{ 3901{
3722 struct x86_emulate_ops *ops = ctxt->ops; 3902 struct x86_emulate_ops *ops = ctxt->ops;
3723 u64 msr_data;
3724 int rc = X86EMUL_CONTINUE; 3903 int rc = X86EMUL_CONTINUE;
3725 int saved_dst_type = ctxt->dst.type; 3904 int saved_dst_type = ctxt->dst.type;
3726 3905
@@ -3854,15 +4033,6 @@ special_insn:
3854 goto cannot_emulate; 4033 goto cannot_emulate;
3855 ctxt->dst.val = (s32) ctxt->src.val; 4034 ctxt->dst.val = (s32) ctxt->src.val;
3856 break; 4035 break;
3857 case 0x6c: /* insb */
3858 case 0x6d: /* insw/insd */
3859 ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
3860 goto do_io_in;
3861 case 0x6e: /* outsb */
3862 case 0x6f: /* outsw/outsd */
3863 ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
3864 goto do_io_out;
3865 break;
3866 case 0x70 ... 0x7f: /* jcc (short) */ 4036 case 0x70 ... 0x7f: /* jcc (short) */
3867 if (test_cc(ctxt->b, ctxt->eflags)) 4037 if (test_cc(ctxt->b, ctxt->eflags))
3868 jmp_rel(ctxt, ctxt->src.val); 4038 jmp_rel(ctxt, ctxt->src.val);
@@ -3870,9 +4040,6 @@ special_insn:
3870 case 0x8d: /* lea r16/r32, m */ 4040 case 0x8d: /* lea r16/r32, m */
3871 ctxt->dst.val = ctxt->src.addr.mem.ea; 4041 ctxt->dst.val = ctxt->src.addr.mem.ea;
3872 break; 4042 break;
3873 case 0x8f: /* pop (sole member of Grp1a) */
3874 rc = em_grp1a(ctxt);
3875 break;
3876 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 4043 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3877 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) 4044 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
3878 break; 4045 break;
@@ -3905,38 +4072,11 @@ special_insn:
3905 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; 4072 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
3906 rc = em_grp2(ctxt); 4073 rc = em_grp2(ctxt);
3907 break; 4074 break;
3908 case 0xe4: /* inb */
3909 case 0xe5: /* in */
3910 goto do_io_in;
3911 case 0xe6: /* outb */
3912 case 0xe7: /* out */
3913 goto do_io_out;
3914 case 0xe8: /* call (near) */ {
3915 long int rel = ctxt->src.val;
3916 ctxt->src.val = (unsigned long) ctxt->_eip;
3917 jmp_rel(ctxt, rel);
3918 rc = em_push(ctxt);
3919 break;
3920 }
3921 case 0xe9: /* jmp rel */ 4075 case 0xe9: /* jmp rel */
3922 case 0xeb: /* jmp rel short */ 4076 case 0xeb: /* jmp rel short */
3923 jmp_rel(ctxt, ctxt->src.val); 4077 jmp_rel(ctxt, ctxt->src.val);
3924 ctxt->dst.type = OP_NONE; /* Disable writeback. */ 4078 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3925 break; 4079 break;
3926 case 0xec: /* in al,dx */
3927 case 0xed: /* in (e/r)ax,dx */
3928 do_io_in:
3929 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
3930 &ctxt->dst.val))
3931 goto done; /* IO is needed */
3932 break;
3933 case 0xee: /* out dx,al */
3934 case 0xef: /* out dx,(e/r)ax */
3935 do_io_out:
3936 ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
3937 &ctxt->src.val, 1);
3938 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3939 break;
3940 case 0xf4: /* hlt */ 4080 case 0xf4: /* hlt */
3941 ctxt->ops->halt(ctxt); 4081 ctxt->ops->halt(ctxt);
3942 break; 4082 break;
@@ -3956,12 +4096,6 @@ special_insn:
3956 case 0xfd: /* std */ 4096 case 0xfd: /* std */
3957 ctxt->eflags |= EFLG_DF; 4097 ctxt->eflags |= EFLG_DF;
3958 break; 4098 break;
3959 case 0xfe: /* Grp4 */
3960 rc = em_grp45(ctxt);
3961 break;
3962 case 0xff: /* Grp5 */
3963 rc = em_grp45(ctxt);
3964 break;
3965 default: 4099 default:
3966 goto cannot_emulate; 4100 goto cannot_emulate;
3967 } 4101 }
@@ -4036,49 +4170,6 @@ twobyte_insn:
4036 case 0x21: /* mov from dr to reg */ 4170 case 0x21: /* mov from dr to reg */
4037 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); 4171 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
4038 break; 4172 break;
4039 case 0x22: /* mov reg, cr */
4040 if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
4041 emulate_gp(ctxt, 0);
4042 rc = X86EMUL_PROPAGATE_FAULT;
4043 goto done;
4044 }
4045 ctxt->dst.type = OP_NONE;
4046 break;
4047 case 0x23: /* mov from reg to dr */
4048 if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
4049 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
4050 ~0ULL : ~0U)) < 0) {
4051 /* #UD condition is already handled by the code above */
4052 emulate_gp(ctxt, 0);
4053 rc = X86EMUL_PROPAGATE_FAULT;
4054 goto done;
4055 }
4056
4057 ctxt->dst.type = OP_NONE; /* no writeback */
4058 break;
4059 case 0x30:
4060 /* wrmsr */
4061 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
4062 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
4063 if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
4064 emulate_gp(ctxt, 0);
4065 rc = X86EMUL_PROPAGATE_FAULT;
4066 goto done;
4067 }
4068 rc = X86EMUL_CONTINUE;
4069 break;
4070 case 0x32:
4071 /* rdmsr */
4072 if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
4073 emulate_gp(ctxt, 0);
4074 rc = X86EMUL_PROPAGATE_FAULT;
4075 goto done;
4076 } else {
4077 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
4078 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
4079 }
4080 rc = X86EMUL_CONTINUE;
4081 break;
4082 case 0x40 ... 0x4f: /* cmov */ 4173 case 0x40 ... 0x4f: /* cmov */
4083 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; 4174 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
4084 if (!test_cc(ctxt->b, ctxt->eflags)) 4175 if (!test_cc(ctxt->b, ctxt->eflags))
@@ -4091,93 +4182,21 @@ twobyte_insn:
4091 case 0x90 ... 0x9f: /* setcc r/m8 */ 4182 case 0x90 ... 0x9f: /* setcc r/m8 */
4092 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4183 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4093 break; 4184 break;
4094 case 0xa3:
4095 bt: /* bt */
4096 ctxt->dst.type = OP_NONE;
4097 /* only subword offset */
4098 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4099 emulate_2op_SrcV_nobyte(ctxt, "bt");
4100 break;
4101 case 0xa4: /* shld imm8, r, r/m */ 4185 case 0xa4: /* shld imm8, r, r/m */
4102 case 0xa5: /* shld cl, r, r/m */ 4186 case 0xa5: /* shld cl, r, r/m */
4103 emulate_2op_cl(ctxt, "shld"); 4187 emulate_2op_cl(ctxt, "shld");
4104 break; 4188 break;
4105 case 0xab:
4106 bts: /* bts */
4107 emulate_2op_SrcV_nobyte(ctxt, "bts");
4108 break;
4109 case 0xac: /* shrd imm8, r, r/m */ 4189 case 0xac: /* shrd imm8, r, r/m */
4110 case 0xad: /* shrd cl, r, r/m */ 4190 case 0xad: /* shrd cl, r, r/m */
4111 emulate_2op_cl(ctxt, "shrd"); 4191 emulate_2op_cl(ctxt, "shrd");
4112 break; 4192 break;
4113 case 0xae: /* clflush */ 4193 case 0xae: /* clflush */
4114 break; 4194 break;
4115 case 0xb0 ... 0xb1: /* cmpxchg */
4116 /*
4117 * Save real source value, then compare EAX against
4118 * destination.
4119 */
4120 ctxt->src.orig_val = ctxt->src.val;
4121 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4122 emulate_2op_SrcV(ctxt, "cmp");
4123 if (ctxt->eflags & EFLG_ZF) {
4124 /* Success: write back to memory. */
4125 ctxt->dst.val = ctxt->src.orig_val;
4126 } else {
4127 /* Failure: write the value we saw to EAX. */
4128 ctxt->dst.type = OP_REG;
4129 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4130 }
4131 break;
4132 case 0xb3:
4133 btr: /* btr */
4134 emulate_2op_SrcV_nobyte(ctxt, "btr");
4135 break;
4136 case 0xb6 ... 0xb7: /* movzx */ 4195 case 0xb6 ... 0xb7: /* movzx */
4137 ctxt->dst.bytes = ctxt->op_bytes; 4196 ctxt->dst.bytes = ctxt->op_bytes;
4138 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4197 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
4139 : (u16) ctxt->src.val; 4198 : (u16) ctxt->src.val;
4140 break; 4199 break;
4141 case 0xba: /* Grp8 */
4142 switch (ctxt->modrm_reg & 3) {
4143 case 0:
4144 goto bt;
4145 case 1:
4146 goto bts;
4147 case 2:
4148 goto btr;
4149 case 3:
4150 goto btc;
4151 }
4152 break;
4153 case 0xbb:
4154 btc: /* btc */
4155 emulate_2op_SrcV_nobyte(ctxt, "btc");
4156 break;
4157 case 0xbc: { /* bsf */
4158 u8 zf;
4159 __asm__ ("bsf %2, %0; setz %1"
4160 : "=r"(ctxt->dst.val), "=q"(zf)
4161 : "r"(ctxt->src.val));
4162 ctxt->eflags &= ~X86_EFLAGS_ZF;
4163 if (zf) {
4164 ctxt->eflags |= X86_EFLAGS_ZF;
4165 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4166 }
4167 break;
4168 }
4169 case 0xbd: { /* bsr */
4170 u8 zf;
4171 __asm__ ("bsr %2, %0; setz %1"
4172 : "=r"(ctxt->dst.val), "=q"(zf)
4173 : "r"(ctxt->src.val));
4174 ctxt->eflags &= ~X86_EFLAGS_ZF;
4175 if (zf) {
4176 ctxt->eflags |= X86_EFLAGS_ZF;
4177 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4178 }
4179 break;
4180 }
4181 case 0xbe ... 0xbf: /* movsx */ 4200 case 0xbe ... 0xbf: /* movsx */
4182 ctxt->dst.bytes = ctxt->op_bytes; 4201 ctxt->dst.bytes = ctxt->op_bytes;
4183 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4202 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
@@ -4194,9 +4213,6 @@ twobyte_insn:
4194 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : 4213 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
4195 (u64) ctxt->src.val; 4214 (u64) ctxt->src.val;
4196 break; 4215 break;
4197 case 0xc7: /* Grp9 (cmpxchg8b) */
4198 rc = em_grp9(ctxt);
4199 break;
4200 default: 4216 default:
4201 goto cannot_emulate; 4217 goto cannot_emulate;
4202 } 4218 }
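
em_bsf()/em_bsr() above keep the bsf/bsr + setz idiom from the old switch: a zero source sets ZF and leaves the destination undefined, so the handlers capture ZF to know whether to write back. The same asm compiles stand-alone; a minimal demo with gcc/clang extended asm on x86:

/* The bsf+setz pattern from em_bsf(): find the lowest set bit and
 * record ZF so a zero input (undefined result) can be detected. */
#include <stdio.h>

static int bsf_checked(unsigned long src, unsigned long *dst)
{
	unsigned char zf;

	__asm__ ("bsf %2, %0; setz %1"
		 : "=r" (*dst), "=q" (zf)
		 : "r" (src));
	return !zf;	/* 1 if a set bit was found */
}

int main(void)
{
	unsigned long bit;

	if (bsf_checked(0x50, &bit))
		printf("lowest set bit of 0x50: %lu\n", bit);	/* prints 4 */
	if (!bsf_checked(0, &bit))
		puts("ZF set: source was zero, result undefined");
	return 0;
}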
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 405f2620392f..d68f99df690c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
344 344	struct kvm_timer *pt = &ps->pit_timer;
345 345	s64 interval;
346 346
347 if (!irqchip_in_kernel(kvm)) 347 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
348 348		return;
349 349
350 350	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
397 397	case 1:
398 398	/* FIXME: enhance mode 4 precision */
399 399	case 4:
400 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { 400 create_pit_timer(kvm, val, 0);
401 create_pit_timer(kvm, val, 0);
402 }
403 401		break;
404 402	case 2:
405 403	case 3:
406 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ 404 create_pit_timer(kvm, val, 1);
407 create_pit_timer(kvm, val, 1);
408 }
409 405		break;
410 406	default:
411 407		destroy_pit_timer(kvm->arch.vpit);
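create_pit_timer() converts the programmed reload value into nanoseconds via muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); with the 8254 input clock at 1193182 Hz, a full reload of 65536 comes out at roughly 54.9 ms, the classic PIT wrap. A hedged user-space sketch of the same arithmetic (the frequency constant is the i8254 clock rate; muldiv64() is stood in for by 128-bit unsigned math, a gcc/clang extension):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define PIT_FREQ     1193182ULL	/* 8254 input clock, Hz */

/* Sketch of the interval computation in create_pit_timer():
 * nanoseconds needed to count down 'val' PIT ticks. */
static uint64_t pit_interval_ns(uint32_t val)
{
	/* muldiv64() in the kernel avoids 64-bit overflow; here
	 * unsigned __int128 serves the same purpose. */
	return (uint64_t)((unsigned __int128)val * NSEC_PER_SEC / PIT_FREQ);
}

int main(void)
{
	printf("%llu ns\n", (unsigned long long)pit_interval_ns(65536));
	/* ~54925401 ns, i.e. the ~54.9 ms full-period wrap */
	return 0;
}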
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746d7ffb..b6a73537e1ef 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm)
262 262
263 263 void kvm_pic_reset(struct kvm_kpic_state *s)
264 264 {
265 int irq; 265 int irq, i;
266 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; 266 struct kvm_vcpu *vcpu;
267 267	u8 irr = s->irr, isr = s->imr;
268 bool found = false;
268 269
269 270	s->last_irr = 0;
270 271	s->irr = 0;
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
281 282	s->special_fully_nested_mode = 0;
282 283	s->init4 = 0;
283 284
284 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { 285 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
285 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 286 if (kvm_apic_accept_pic_intr(vcpu)) {
286 if (irr & (1 << irq) || isr & (1 << irq)) { 287 found = true;
287 pic_clear_isr(s, irq); 288 break;
288 } 289 }
289 } 290
291
292 if (!found)
293 return;
294
295 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
296 if (irr & (1 << irq) || isr & (1 << irq))
297 pic_clear_isr(s, irq);
290 298 }
291 299
292 300 static void pic_ioport_write(void *opaque, u32 addr, u32 val)
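The reworked kvm_pic_reset() first scans the vCPUs for one that will accept PIC interrupts at all, and only then acknowledges the pins that were raised (IRR) or in service (ISR). A small sketch of that final pin-scanning step, with pic_clear_isr() reduced to a stub (the real function also notifies irq ack handlers):

#include <stdint.h>
#include <stdio.h>

#define PIC_NUM_PINS 16	/* master + slave together; one PIC handles 8 */

/* Stub standing in for pic_clear_isr(). */
static void clear_isr(int irq)
{
	printf("ack irq %d\n", irq);
}

/* Sketch of the loop at the end of kvm_pic_reset(): every pin that
 * was raised (irr) or in service (isr) gets acknowledged. */
static void reset_pending(uint8_t irr, uint8_t isr)
{
	int irq;

	for (irq = 0; irq < PIC_NUM_PINS / 2; irq++)
		if (irr & (1 << irq) || isr & (1 << irq))
			clear_isr(irq);
}

int main(void)
{
	reset_pending(0x05, 0x20);	/* pins 0, 2 raised; pin 5 in service */
	return 0;
}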
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40199d6..cfdc6e0ef002 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@
38 38 #include "irq.h"
39 39 #include "trace.h"
40 40 #include "x86.h"
41#include "cpuid.h"
41 42
42 43 #ifndef CONFIG_X86_64
43 44 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -1120,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
1120 1121	return 0;
1121 1122 }
1122 1123
1123static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) 1124int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1124 1125 {
1125 1126	u32 reg = apic_get_reg(apic, lvt_type);
1126 1127	int vector, mode, trig_mode;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc6fea6..6f4ce2575d09 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
34 34 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
35 35 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
36 36 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
37int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
37 38
38 39 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
39 40 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d0..224b02c3cda9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,15 +59,6 @@ enum {
59 59	AUDIT_POST_SYNC
60 60 };
61 61
62char *audit_point_name[] = {
63 "pre page fault",
64 "post page fault",
65 "pre pte write",
66 "post pte write",
67 "pre sync",
68 "post sync"
69};
70
71 62 #undef MMU_DEBUG
72 63
73 64 #ifdef MMU_DEBUG
@@ -83,13 +74,10 @@ char *audit_point_name[] = {
83 74 #endif
84 75
85 76 #ifdef MMU_DEBUG
86static int dbg = 0; 77static bool dbg = 0;
87 78 module_param(dbg, bool, 0644);
88 79 #endif
89 80
90static int oos_shadow = 1;
91module_param(oos_shadow, bool, 0644);
92
93 81 #ifndef MMU_DEBUG
94 82 #define ASSERT(x) do { } while (0)
95 83 #else
@@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
593 581	return 0;
594 582 }
595 583
584static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
585{
586 return cache->nobjs;
587}
588
596 589 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
597 590				  struct kmem_cache *cache)
598 591 {
@@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
953 946	}
954 947 }
955 948
949static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
950 struct kvm_memory_slot *slot)
951{
952 struct kvm_lpage_info *linfo;
953
954 if (likely(level == PT_PAGE_TABLE_LEVEL))
955 return &slot->rmap[gfn - slot->base_gfn];
956
957 linfo = lpage_info_slot(gfn, slot, level);
958 return &linfo->rmap_pde;
959}
960
956 961 /*
957 962  * Take gfn and return the reverse mapping to it.
958 963  */
959 964 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
960 965 {
961 966	struct kvm_memory_slot *slot;
962 struct kvm_lpage_info *linfo;
963 967
964 968	slot = gfn_to_memslot(kvm, gfn);
965 if (likely(level == PT_PAGE_TABLE_LEVEL)) 969 return __gfn_to_rmap(kvm, gfn, level, slot);
966 return &slot->rmap[gfn - slot->base_gfn]; 970}
967 971
968 linfo = lpage_info_slot(gfn, slot, level); 972static bool rmap_can_add(struct kvm_vcpu *vcpu)
973{
974 struct kvm_mmu_memory_cache *cache;
969 975
970 return &linfo->rmap_pde; 976 cache = &vcpu->arch.mmu_pte_list_desc_cache;
977 return mmu_memory_cache_free_objects(cache);
971 978 }
972 979
973 980 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
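The new __gfn_to_rmap() helper picks the reverse-map chain by mapping level: 4K pages index the slot's flat rmap array by gfn offset, while large pages go through the per-level lpage_info metadata. A shape-only C sketch under assumed minimal types (the struct layout and the lpage_info indexing are simplifications; the real lpage_info_slot() lives elsewhere in mmu.c):

#include <stdint.h>
#include <stddef.h>

/* Minimal stand-ins for the slot layout assumed by __gfn_to_rmap();
 * field names follow the kernel but the types are illustrative. */
struct lpage_info { unsigned long rmap_pde; };
struct memslot {
	uint64_t base_gfn;
	unsigned long *rmap;			/* one entry per 4K gfn */
	struct lpage_info *lpage_info[2];	/* [0]=2M, [1]=1G level */
};

#define PT_PAGE_TABLE_LEVEL 1

static unsigned long *gfn_to_rmap_sketch(struct memslot *slot,
					 uint64_t gfn, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return &slot->rmap[gfn - slot->base_gfn];
	/* large page: index the per-level metadata; each level spans
	 * 512x more gfns than the one below it */
	return &slot->lpage_info[level - 2]
		[(gfn - slot->base_gfn) >> (9 * (level - 1))].rmap_pde;
}

int main(void)
{
	unsigned long rmap[16] = {0};
	struct lpage_info l2[4] = {{0}};
	struct memslot s = { 0, rmap, { l2, NULL } };

	return gfn_to_rmap_sketch(&s, 3, PT_PAGE_TABLE_LEVEL) == &rmap[3]
		? 0 : 1;
}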
@@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
1004 1011	rmap_remove(kvm, sptep);
1005 1012 }
1006 1013
1007static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1014int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
1015 struct kvm_memory_slot *slot)
1008 1016 {
1009 1017	unsigned long *rmapp;
1010 1018	u64 *spte;
1011 1019	int i, write_protected = 0;
1012 1020
1013 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 1021 rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
1014
1015 1022	spte = rmap_next(kvm, rmapp, NULL);
1016 1023	while (spte) {
1017 BUG_ON(!spte);
1018 1024		BUG_ON(!(*spte & PT_PRESENT_MASK));
1019 1025		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
1020 1026		if (is_writable_pte(*spte)) {
@@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
1027 1033	/* check for huge page mappings */
1028 1034	for (i = PT_DIRECTORY_LEVEL;
1029 1035	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1030 rmapp = gfn_to_rmap(kvm, gfn, i); 1036 rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
1031 1037		spte = rmap_next(kvm, rmapp, NULL);
1032 1038		while (spte) {
1033 BUG_ON(!spte);
1034 1039			BUG_ON(!(*spte & PT_PRESENT_MASK));
1035 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 1040 BUG_ON(!is_large_pte(*spte));
1036 1041			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
1037 1042			if (is_writable_pte(*spte)) {
1038 1043				drop_spte(kvm, spte);
@@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
1047 1052	return write_protected;
1048 1053 }
1049 1054
1055static int rmap_write_protect(struct kvm *kvm, u64 gfn)
1056{
1057 struct kvm_memory_slot *slot;
1058
1059 slot = gfn_to_memslot(kvm, gfn);
1060 return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
1061}
1062
1050 1063 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1051 1064			   unsigned long data)
1052 1065 {
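kvm_mmu_rmap_write_protect() above walks the rmap chain and strips the writable bit from every present 4K spte (large sptes are dropped outright), returning whether anything changed so the caller can decide on a TLB flush. A sketch of the per-spte step, with the spte bits named after the usual x86 PTE layout (bit 0 present, bit 1 writable; an assumption for illustration, not the kernel's mask definitions):

#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK  (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << 1)

/* Sketch of the write-protect step applied to each spte found on the
 * rmap chain; returns 1 if a writable mapping was downgraded. */
static int write_protect_spte(uint64_t *spte)
{
	if (!(*spte & PT_PRESENT_MASK) || !(*spte & PT_WRITABLE_MASK))
		return 0;
	*spte &= ~PT_WRITABLE_MASK;
	return 1;
}

int main(void)
{
	uint64_t spte = PT_PRESENT_MASK | PT_WRITABLE_MASK;

	printf("protected=%d spte=%#llx\n", write_protect_spte(&spte),
	       (unsigned long long)spte);
	return 0;
}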
@@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1103 1116			int (*handler)(struct kvm *kvm, unsigned long *rmapp,
1104 1117				       unsigned long data))
1105 1118 {
1106 int i, j; 1119 int j;
1107 1120	int ret;
1108 1121	int retval = 0;
1109 1122	struct kvm_memslots *slots;
1123 struct kvm_memory_slot *memslot;
1110 1124
1111 1125	slots = kvm_memslots(kvm);
1112 1126
1113 for (i = 0; i < slots->nmemslots; i++) { 1127 kvm_for_each_memslot(memslot, slots) {
1114 struct kvm_memory_slot *memslot = &slots->memslots[i];
1115 1128		unsigned long start = memslot->userspace_addr;
1116 1129		unsigned long end;
1117 1130
@@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1324 1337					       PAGE_SIZE);
1325 1338	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1326 1339	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1327 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1340 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
1328 1341	sp->parent_ptes = 0;
1329 1342	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1330 1343	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1511 1524	return ret;
1512 1525 }
1513 1526
1527#ifdef CONFIG_KVM_MMU_AUDIT
1528#include "mmu_audit.c"
1529#else
1530static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
1531static void mmu_audit_disable(void) { }
1532#endif
1533
1514 1534 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1515 1535			 struct list_head *invalid_list)
1516 1536 {
@@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
1640 1660		sp->spt[i] = 0ull;
1641 1661 }
1642 1662
1663static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
1664{
1665 sp->write_flooding_count = 0;
1666}
1667
1668static void clear_sp_write_flooding_count(u64 *spte)
1669{
1670 struct kvm_mmu_page *sp = page_header(__pa(spte));
1671
1672 __clear_sp_write_flooding_count(sp);
1673}
1674
1643 1675 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1644 1676					     gfn_t gfn,
1645 1677					     gva_t gaddr,
@@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1683 1715		} else if (sp->unsync)
1684 1716			kvm_mmu_mark_parents_unsync(sp);
1685 1717
1718 __clear_sp_write_flooding_count(sp);
1686 1719		trace_kvm_mmu_get_page(sp, false);
1687 1720		return sp;
1688 1721	}
@@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1796 1829	}
1797 1830 }
1798 1831
1799static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 1832static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1800 1833			     u64 *spte)
1801 1834 {
1802 1835	u64 pte;
@@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1804 1837
1805 1838	pte = *spte;
1806 1839	if (is_shadow_present_pte(pte)) {
1807 if (is_last_spte(pte, sp->role.level)) 1840 if (is_last_spte(pte, sp->role.level)) {
1808 1841			drop_spte(kvm, spte);
1809 else { 1842 if (is_large_pte(pte))
1843 --kvm->stat.lpages;
1844 } else {
1810 1845			child = page_header(pte & PT64_BASE_ADDR_MASK);
1811 1846			drop_parent_pte(child, spte);
1812 1847		}
1813 } else if (is_mmio_spte(pte)) 1848 return true;
1849 }
1850
1851 if (is_mmio_spte(pte))
1814 1852		mmu_spte_clear_no_track(spte);
1815 1853
1816 if (is_large_pte(pte)) 1854 return false;
1817 --kvm->stat.lpages;
1818 1855 }
1819 1856
1820 1857 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1831 1868	mmu_page_remove_parent_pte(sp, parent_pte);
1832 1869 }
1833 1870
1834static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1835{
1836 int i;
1837 struct kvm_vcpu *vcpu;
1838
1839 kvm_for_each_vcpu(i, vcpu, kvm)
1840 vcpu->arch.last_pte_updated = NULL;
1841}
1842
1843 1871 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1844 1872 {
1845 1873	u64 *parent_pte;
@@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1899 1927	}
1900 1928
1901 1929	sp->role.invalid = 1;
1902 kvm_mmu_reset_last_pte_updated(kvm);
1903 1930	return ret;
1904 1931 }
1905 1932
@@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1985 2012	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1986 2013 }
1987 2014
1988static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2015int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1989 2016 {
1990 2017	struct kvm_mmu_page *sp;
1991 2018	struct hlist_node *node;
@@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1994 2021
1995 2022	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1996 2023	r = 0;
1997 2024 spin_lock(&kvm->mmu_lock);
1998 2025	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1999 2026		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2000 2027			 sp->role.word);
@@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2002 2029		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2003 2030	}
2004 2031	kvm_mmu_commit_zap_page(kvm, &invalid_list);
2005 return r; 2032 spin_unlock(&kvm->mmu_lock);
2006}
2007
2008static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
2009{
2010 struct kvm_mmu_page *sp;
2011 struct hlist_node *node;
2012 LIST_HEAD(invalid_list);
2013 2033
2014 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 2034 return r;
2015 pgprintk("%s: zap %llx %x\n",
2016 __func__, gfn, sp->role.word);
2017 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2018 }
2019 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2020 2035 }
2036EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2021 2037
2022 2038 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
2023 2039 {
@@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2169 2185		return 1;
2170 2186
2171 2187	if (!need_unsync && !s->unsync) {
2172 if (!oos_shadow)
2173 return 1;
2174 2188		need_unsync = true;
2175 2189	}
2176 2190 }
@@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2191 2205	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2192 2206		return 0;
2193 2207
2194 /*
2195 * We don't set the accessed bit, since we sometimes want to see
2196 * whether the guest actually used the pte (in order to detect
2197 * demand paging).
2198 */
2199 2208	spte = PT_PRESENT_MASK;
2200 2209	if (!speculative)
2201 2210		spte |= shadow_accessed_mask;
@@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2346 2355		}
2347 2356	}
2348 2357	kvm_release_pfn_clean(pfn);
2349 if (speculative) {
2350 vcpu->arch.last_pte_updated = sptep;
2351 vcpu->arch.last_pte_gfn = gfn;
2352 }
2353 2358 }
2354 2359
2355 2360 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2840 2845		return;
2841 2846
2842 2847	vcpu_clear_mmio_info(vcpu, ~0ul);
2843 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 2848 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2844 2849	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2845 2850		hpa_t root = vcpu->arch.mmu.root_hpa;
2846 2851		sp = page_header(root);
2847 2852		mmu_sync_children(vcpu, sp);
2848 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 2853 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2849 2854		return;
2850 2855	}
2851 2856	for (i = 0; i < 4; ++i) {
@@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2857 2862			mmu_sync_children(vcpu, sp);
2858 2863		}
2859 2864	}
2860 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 2865 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2861 2866 }
2862 2867
2863 2868 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3510 3515	kvm_mmu_flush_tlb(vcpu);
3511 3516 }
3512 3517
3513static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) 3518static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3519 const u8 *new, int *bytes)
3514{ 3520{
3515 u64 *spte = vcpu->arch.last_pte_updated; 3521 u64 gentry;
3522 int r;
3523
3524 /*
3525 * Assume that the pte write on a page table of the same type
3526 * as the current vcpu paging mode since we update the sptes only
3527 * when they have the same mode.
3528 */
3529 if (is_pae(vcpu) && *bytes == 4) {
3530 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3531 *gpa &= ~(gpa_t)7;
3532 *bytes = 8;
3533 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
3534 if (r)
3535 gentry = 0;
3536 new = (const u8 *)&gentry;
3537 }
3516 3538
3517 return !!(spte && (*spte & shadow_accessed_mask)); 3539 switch (*bytes) {
3540 case 4:
3541 gentry = *(const u32 *)new;
3542 break;
3543 case 8:
3544 gentry = *(const u64 *)new;
3545 break;
3546 default:
3547 gentry = 0;
3548 break;
3549 }
3550
3551 return gentry;
3518} 3552}
3519 3553
3520static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3554/*
3555 * If we're seeing too many writes to a page, it may no longer be a page table,
3556 * or we may be forking, in which case it is better to unmap the page.
3557 */
3558static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
3521{ 3559{
3522 u64 *spte = vcpu->arch.last_pte_updated; 3560 /*
3561 * Skip write-flooding detection for an sp whose level is 1, because
3562 * it can become unsync, and then the guest page is not write-protected.
3563 */
3564 if (sp->role.level == 1)
3565 return false;
3523 3566
3524 if (spte 3567 return ++sp->write_flooding_count >= 3;
3525 && vcpu->arch.last_pte_gfn == gfn 3568}
3526 && shadow_accessed_mask 3569
3527 && !(*spte & shadow_accessed_mask) 3570/*
3528 && is_shadow_present_pte(*spte)) 3571 * Misaligned accesses are too much trouble to fix up; also, they usually
3529 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 3572 * indicate a page is not used as a page table.
3573 */
3574static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
3575 int bytes)
3576{
3577 unsigned offset, pte_size, misaligned;
3578
3579 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3580 gpa, bytes, sp->role.word);
3581
3582 offset = offset_in_page(gpa);
3583 pte_size = sp->role.cr4_pae ? 8 : 4;
3584
3585 /*
3586 * Sometimes, the OS only writes the last byte to update status
3587 * bits; for example, Linux uses an andb instruction in clear_bit().
3588 */
3589 if (!(offset & (pte_size - 1)) && bytes == 1)
3590 return false;
3591
3592 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3593 misaligned |= bytes < 4;
3594
3595 return misaligned;
3596}
3597
3598static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
3599{
3600 unsigned page_offset, quadrant;
3601 u64 *spte;
3602 int level;
3603
3604 page_offset = offset_in_page(gpa);
3605 level = sp->role.level;
3606 *nspte = 1;
3607 if (!sp->role.cr4_pae) {
3608 page_offset <<= 1; /* 32->64 */
3609 /*
3610 * A 32-bit pde maps 4MB while the shadow pdes map
3611 * only 2MB. So we need to double the offset again
3612 * and zap two pdes instead of one.
3613 */
3614 if (level == PT32_ROOT_LEVEL) {
3615 page_offset &= ~7; /* kill rounding error */
3616 page_offset <<= 1;
3617 *nspte = 2;
3618 }
3619 quadrant = page_offset >> PAGE_SHIFT;
3620 page_offset &= ~PAGE_MASK;
3621 if (quadrant != sp->role.quadrant)
3622 return NULL;
3623 }
3624
3625 spte = &sp->spt[page_offset / sizeof(*spte)];
3626 return spte;
3530} 3627}
3531 3628
3532void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 3629void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3533 const u8 *new, int bytes, 3630 const u8 *new, int bytes)
3534 bool guest_initiated)
3535{ 3631{
3536 3632	gfn_t gfn = gpa >> PAGE_SHIFT;
3537 3633	union kvm_mmu_page_role mask = { .word = 0 };
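mmu_pte_write_fetch_gpte() above papers over a 32-bit PAE guest updating a 64-bit gpte with two 4-byte stores: the gpa is rounded down to an 8-byte boundary and the whole entry is re-read, so the shadow code always sees a complete pte. A user-space sketch of the alignment step, with kvm_read_guest() replaced by a memcpy from a fake guest page (names are illustrative):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Pretend guest memory holding one PAE page table. */
static uint8_t guest_page[4096];

/* Sketch of the widening in mmu_pte_write_fetch_gpte(): a 4-byte
 * write inside a PAE gpte becomes a full 8-byte re-read at the
 * aligned address. */
static uint64_t fetch_gpte(uint64_t *gpa, int *bytes)
{
	uint64_t gentry;

	if (*bytes == 4) {
		*gpa &= ~(uint64_t)7;	/* align to the 64-bit gpte */
		*bytes = 8;
	}
	memcpy(&gentry, guest_page + (*gpa & 4095), 8);
	return gentry;
}

int main(void)
{
	uint64_t gpa = 0x1c;	/* high half of the gpte at 0x18 */
	int bytes = 4;

	memset(guest_page + 0x18, 0xab, 8);
	printf("gpte=%#llx gpa=%#llx bytes=%d\n",
	       (unsigned long long)fetch_gpte(&gpa, &bytes),
	       (unsigned long long)gpa, bytes);
	return 0;
}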
@@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3539 struct hlist_node *node; 3635 struct hlist_node *node;
3540 LIST_HEAD(invalid_list); 3636 LIST_HEAD(invalid_list);
3541 u64 entry, gentry, *spte; 3637 u64 entry, gentry, *spte;
3542 unsigned pte_size, page_offset, misaligned, quadrant, offset; 3638 int npte;
3543 int level, npte, invlpg_counter, r, flooded = 0;
3544 bool remote_flush, local_flush, zap_page; 3639 bool remote_flush, local_flush, zap_page;
3545 3640
3546 /* 3641 /*
@@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3551 3646		return;
3552 3647
3553 3648	zap_page = remote_flush = local_flush = false;
3554 offset = offset_in_page(gpa);
3555 3649
3556 3650	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3557 3651
3558 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); 3652 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
3559 3653
3560 /* 3654 /*
3561 * Assume that the pte write on a page table of the same type 3655 * No need to care whether memory allocation is successful
3562 * as the current vcpu paging mode since we update the sptes only 3656 * or not, since pte prefetch is skipped if it does not have
3563 * when they have the same mode. 3657 * enough objects in the cache.
3564 */ 3658 */
3565 if ((is_pae(vcpu) && bytes == 4) || !new) { 3659 mmu_topup_memory_caches(vcpu);
3566 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3567 if (is_pae(vcpu)) {
3568 gpa &= ~(gpa_t)7;
3569 bytes = 8;
3570 }
3571 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3572 if (r)
3573 gentry = 0;
3574 new = (const u8 *)&gentry;
3575 }
3576
3577 switch (bytes) {
3578 case 4:
3579 gentry = *(const u32 *)new;
3580 break;
3581 case 8:
3582 gentry = *(const u64 *)new;
3583 break;
3584 default:
3585 gentry = 0;
3586 break;
3587 }
3588 3660
3589 3661	spin_lock(&vcpu->kvm->mmu_lock);
3590 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3591 gentry = 0;
3592 kvm_mmu_free_some_pages(vcpu);
3593 3662	++vcpu->kvm->stat.mmu_pte_write;
3594 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3663 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3595 if (guest_initiated) {
3596 kvm_mmu_access_page(vcpu, gfn);
3597 if (gfn == vcpu->arch.last_pt_write_gfn
3598 && !last_updated_pte_accessed(vcpu)) {
3599 ++vcpu->arch.last_pt_write_count;
3600 if (vcpu->arch.last_pt_write_count >= 3)
3601 flooded = 1;
3602 } else {
3603 vcpu->arch.last_pt_write_gfn = gfn;
3604 vcpu->arch.last_pt_write_count = 1;
3605 vcpu->arch.last_pte_updated = NULL;
3606 }
3607 }
3608 3664
3609 3665	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3610 3666	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3611 pte_size = sp->role.cr4_pae ? 8 : 4; 3667 spte = get_written_sptes(sp, gpa, &npte);
3612 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 3668
3613 misaligned |= bytes < 4; 3669 if (detect_write_misaligned(sp, gpa, bytes) ||
3614 if (misaligned || flooded) { 3670 detect_write_flooding(sp, spte)) {
3615 /*
3616 * Misaligned accesses are too much trouble to fix
3617 * up; also, they usually indicate a page is not used
3618 * as a page table.
3619 *
3620 * If we're seeing too many writes to a page,
3621 * it may no longer be a page table, or we may be
3622 * forking, in which case it is better to unmap the
3623 * page.
3624 */
3625 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3626 gpa, bytes, sp->role.word);
3627 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3671 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3628 3672						      &invalid_list);
3629 3673			++vcpu->kvm->stat.mmu_flooded;
3630 3674			continue;
3631 3675		}
3632 page_offset = offset; 3676
3633 level = sp->role.level; 3677 spte = get_written_sptes(sp, gpa, &npte);
3634 npte = 1; 3678 if (!spte)
3635 if (!sp->role.cr4_pae) { 3679 continue;
3636 page_offset <<= 1; /* 32->64 */ 3680
3637 /*
3638 * A 32-bit pde maps 4MB while the shadow pdes map
3639 * only 2MB. So we need to double the offset again
3640 * and zap two pdes instead of one.
3641 */
3642 if (level == PT32_ROOT_LEVEL) {
3643 page_offset &= ~7; /* kill rounding error */
3644 page_offset <<= 1;
3645 npte = 2;
3646 }
3647 quadrant = page_offset >> PAGE_SHIFT;
3648 page_offset &= ~PAGE_MASK;
3649 if (quadrant != sp->role.quadrant)
3650 continue;
3651 }
3652 3681		local_flush = true;
3653 spte = &sp->spt[page_offset / sizeof(*spte)];
3654 3682		while (npte--) {
3655 3683			entry = *spte;
3656 3684			mmu_page_zap_pte(vcpu->kvm, sp, spte);
3657 3685			if (gentry &&
3658 3686			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3659 & mask.word)) 3687 & mask.word) && rmap_can_add(vcpu))
3660 3688				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3661 3689			if (!remote_flush && need_remote_flush(entry, *spte))
3662 3690				remote_flush = true;
@@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3665 3693	}
3666 3694	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3667 3695	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3668 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3696 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3669 3697	spin_unlock(&vcpu->kvm->mmu_lock);
3670 3698 }
3671 3699
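The zap decision in kvm_mmu_pte_write() now factors into two predicates: detect_write_misaligned() (a write that straddles or underfills a pte rarely targets a page table) and detect_write_flooding() (three or more rapid writes suggest the page stopped being one). A condensed sketch of the misalignment test, with the values worked through in the comments:

#include <stdio.h>

/* Sketch of detect_write_misaligned(): 'pae' selects 8-byte ptes.
 * A one-byte status update at the start of a pte is tolerated; any
 * write that crosses a pte boundary or is narrower than 4 bytes is
 * treated as misaligned. */
static int write_misaligned(unsigned offset, int bytes, int pae)
{
	unsigned pte_size = pae ? 8 : 4;
	unsigned misaligned;

	if (!(offset & (pte_size - 1)) && bytes == 1)
		return 0;	/* e.g. Linux clear_bit() using andb */

	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	return misaligned || bytes < 4;
}

int main(void)
{
	printf("%d\n", write_misaligned(0x10, 8, 1));	/* 0: one full gpte */
	printf("%d\n", write_misaligned(0x14, 8, 1));	/* 1: straddles two */
	return 0;
}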
@@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
3679 3707
3680 3708	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
3681 3709
3682 spin_lock(&vcpu->kvm->mmu_lock);
3683 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3710 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3684 spin_unlock(&vcpu->kvm->mmu_lock); 3711
3685 3712	return r;
3686 3713 }
3687 3714 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3702 3729	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3703 3730 }
3704 3731
3732static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
3733{
3734 if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
3735 return vcpu_match_mmio_gpa(vcpu, addr);
3736
3737 return vcpu_match_mmio_gva(vcpu, addr);
3738}
3739
3705 3740 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3706 3741		       void *insn, int insn_len)
3707 3742 {
3708 int r; 3743 int r, emulation_type = EMULTYPE_RETRY;
3709 3744	enum emulation_result er;
3710 3745
3711 3746	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3717 3752		goto out;
3718 3753	}
3719 3754
3720 r = mmu_topup_memory_caches(vcpu); 3755 if (is_mmio_page_fault(vcpu, cr2))
3721 if (r) 3756 emulation_type = 0;
3722 goto out;
3723 3757
3724 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); 3758 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
3725 3759
3726 3760	switch (er) {
3727 3761	case EMULATE_DONE:
@@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3792 3826 int kvm_mmu_create(struct kvm_vcpu *vcpu)
3793 3827 {
3794 3828	ASSERT(vcpu);
3795 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3829
3830 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
3831 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3832 vcpu->arch.mmu.translate_gpa = translate_gpa;
3833 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
3796 3834
3797 3835	return alloc_mmu_pages(vcpu);
3798 3836 }
@@ -3852,14 +3890,14 @@ restart:
3852 3890	spin_unlock(&kvm->mmu_lock);
3853 3891 }
3854 3892
3855static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 3893static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3856 3894					struct list_head *invalid_list)
3857 3895 {
3858 3896	struct kvm_mmu_page *page;
3859 3897
3860 3898	page = container_of(kvm->arch.active_mmu_pages.prev,
3861 3899			    struct kvm_mmu_page, link);
3862 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3900 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3863 3901 }
3864 3902
3865 3903 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3874 3912	raw_spin_lock(&kvm_lock);
3875 3913
3876 3914	list_for_each_entry(kvm, &vm_list, vm_list) {
3877 int idx, freed_pages; 3915 int idx;
3878 3916		LIST_HEAD(invalid_list);
3879 3917
3880 3918		idx = srcu_read_lock(&kvm->srcu);
3881 3919		spin_lock(&kvm->mmu_lock);
3882 3920		if (!kvm_freed && nr_to_scan > 0 &&
3883 3921		    kvm->arch.n_used_mmu_pages > 0) {
3884 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3922 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3885 3923							    &invalid_list);
3886 3924			kvm_freed = kvm;
3887 3925		}
3888 3926		nr_to_scan--;
@@ -3944,15 +3982,15 @@ nomem:
3944 3982  */
3945 3983 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3946 3984 {
3947 int i;
3948 3985	unsigned int nr_mmu_pages;
3949 3986	unsigned int nr_pages = 0;
3950 3987	struct kvm_memslots *slots;
3988 struct kvm_memory_slot *memslot;
3951 3989
3952 3990	slots = kvm_memslots(kvm);
3953 3991
3954 for (i = 0; i < slots->nmemslots; i++) 3992 kvm_for_each_memslot(memslot, slots)
3955 nr_pages += slots->memslots[i].npages; 3993 nr_pages += memslot->npages;
3956 3994
3957 3995	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3958 3996	nr_mmu_pages = max(nr_mmu_pages,
@@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3961 3999	return nr_mmu_pages;
3962 4000 }
3963 4001
3964static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3965 unsigned len)
3966{
3967 if (len > buffer->len)
3968 return NULL;
3969 return buffer->ptr;
3970}
3971
3972static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3973 unsigned len)
3974{
3975 void *ret;
3976
3977 ret = pv_mmu_peek_buffer(buffer, len);
3978 if (!ret)
3979 return ret;
3980 buffer->ptr += len;
3981 buffer->len -= len;
3982 buffer->processed += len;
3983 return ret;
3984}
3985
3986static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3987 gpa_t addr, gpa_t value)
3988{
3989 int bytes = 8;
3990 int r;
3991
3992 if (!is_long_mode(vcpu) && !is_pae(vcpu))
3993 bytes = 4;
3994
3995 r = mmu_topup_memory_caches(vcpu);
3996 if (r)
3997 return r;
3998
3999 if (!emulator_write_phys(vcpu, addr, &value, bytes))
4000 return -EFAULT;
4001
4002 return 1;
4003}
4004
4005static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
4006{
4007 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
4008 return 1;
4009}
4010
4011static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
4012{
4013 spin_lock(&vcpu->kvm->mmu_lock);
4014 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
4015 spin_unlock(&vcpu->kvm->mmu_lock);
4016 return 1;
4017}
4018
4019static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
4020 struct kvm_pv_mmu_op_buffer *buffer)
4021{
4022 struct kvm_mmu_op_header *header;
4023
4024 header = pv_mmu_peek_buffer(buffer, sizeof *header);
4025 if (!header)
4026 return 0;
4027 switch (header->op) {
4028 case KVM_MMU_OP_WRITE_PTE: {
4029 struct kvm_mmu_op_write_pte *wpte;
4030
4031 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
4032 if (!wpte)
4033 return 0;
4034 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
4035 wpte->pte_val);
4036 }
4037 case KVM_MMU_OP_FLUSH_TLB: {
4038 struct kvm_mmu_op_flush_tlb *ftlb;
4039
4040 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
4041 if (!ftlb)
4042 return 0;
4043 return kvm_pv_mmu_flush_tlb(vcpu);
4044 }
4045 case KVM_MMU_OP_RELEASE_PT: {
4046 struct kvm_mmu_op_release_pt *rpt;
4047
4048 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
4049 if (!rpt)
4050 return 0;
4051 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
4052 }
4053 default: return 0;
4054 }
4055}
4056
4057int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
4058 gpa_t addr, unsigned long *ret)
4059{
4060 int r;
4061 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
4062
4063 buffer->ptr = buffer->buf;
4064 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
4065 buffer->processed = 0;
4066
4067 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
4068 if (r)
4069 goto out;
4070
4071 while (buffer->len) {
4072 r = kvm_pv_mmu_op_one(vcpu, buffer);
4073 if (r < 0)
4074 goto out;
4075 if (r == 0)
4076 break;
4077 }
4078
4079 r = 1;
4080out:
4081 *ret = buffer->processed;
4082 return r;
4083}
4084
4085 4002 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
4086 4003 {
4087 4004	struct kvm_shadow_walk_iterator iterator;
@@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4110 4027	mmu_free_memory_caches(vcpu);
4111 4028 }
4112 4029
4113#ifdef CONFIG_KVM_MMU_AUDIT
4114#include "mmu_audit.c"
4115#else
4116static void mmu_audit_disable(void) { }
4117#endif
4118
4119 4030 void kvm_mmu_module_exit(void)
4120 4031 {
4121 4032	mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec259d024..fe15dcc07a6b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,6 +19,15 @@
19 19
20 20 #include <linux/ratelimit.h>
21 21
22char const *audit_point_name[] = {
23 "pre page fault",
24 "post page fault",
25 "pre pte write",
26 "post pte write",
27 "pre sync",
28 "post sync"
29};
30
22 31 #define audit_printk(kvm, fmt, args...)		\
23 32	printk(KERN_ERR "audit: (%s) error: "		\
24 33		fmt, audit_point_name[kvm->arch.audit_point], ##args)
@@ -224,7 +233,10 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
224 233	mmu_spte_walk(vcpu, audit_spte);
225 234 }
226 235
227static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) 236static bool mmu_audit;
237static struct jump_label_key mmu_audit_key;
238
239static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
228 240 {
229 241	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
230 242
@@ -236,18 +248,18 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
236 248	audit_vcpu_spte(vcpu);
237 249 }
238 250
239static bool mmu_audit; 251static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
252{
253 if (static_branch((&mmu_audit_key)))
254 __kvm_mmu_audit(vcpu, point);
255}
240 256
241 257 static void mmu_audit_enable(void)
242 258 {
243 int ret;
244
245 259	if (mmu_audit)
246 260		return;
247 261
248 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); 262 jump_label_inc(&mmu_audit_key);
249 WARN_ON(ret);
250
251 263	mmu_audit = true;
252 264 }
253 265
@@ -256,8 +268,7 @@ static void mmu_audit_disable(void)
256 268	if (!mmu_audit)
257 269		return;
258 270
259 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); 271 jump_label_dec(&mmu_audit_key);
260 tracepoint_synchronize_unregister();
261 272	mmu_audit = false;
262 273 }
263 274
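mmu_audit.c now gates the audit body behind a jump label instead of a tracepoint: kvm_mmu_audit() costs a patched-out branch when auditing is off, and jump_label_inc()/jump_label_dec() flip it at runtime (this jump_label API was later renamed to static keys). A user-space sketch of the same pattern, with an atomic flag explicitly standing in for the patched branch, which a kernel cannot be imitated outside of:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the mmu_audit_key jump label: the kernel patches the
 * branch in the instruction stream; here an atomic counter plays the
 * role of jump_label_inc()/jump_label_dec(). */
static atomic_int audit_key;

static void __audit(const char *point)
{
	printf("audit: %s\n", point);
}

static inline void audit(const char *point)
{
	if (atomic_load_explicit(&audit_key, memory_order_relaxed))
		__audit(point);	/* slow path, normally skipped */
}

int main(void)
{
	audit("pre pte write");			/* disabled: no output */
	atomic_fetch_add(&audit_key, 1);	/* mmu_audit_enable() */
	audit("pre pte write");			/* enabled: prints */
	atomic_fetch_sub(&audit_key, 1);	/* mmu_audit_disable() */
	return 0;
}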
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f34146d..89fb0e81322a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
243 243	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 244		  __entry->access)
245 245 );
246
247TRACE_EVENT(
248 kvm_mmu_audit,
249 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
250 TP_ARGS(vcpu, audit_point),
251
252 TP_STRUCT__entry(
253 __field(struct kvm_vcpu *, vcpu)
254 __field(int, audit_point)
255 ),
256
257 TP_fast_assign(
258 __entry->vcpu = vcpu;
259 __entry->audit_point = audit_point;
260 ),
261
262 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
263 audit_point_name[__entry->audit_point])
264);
265 246 #endif /* _TRACE_KVMMMU_H */
266 247
267 248 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 92994100638b..15610285ebb6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
497 497	     shadow_walk_next(&it)) {
498 498		gfn_t table_gfn;
499 499
500 clear_sp_write_flooding_count(it.sptep);
500 501		drop_large_spte(vcpu, it.sptep);
501 502
502 503		sp = NULL;
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
522 523	     shadow_walk_next(&it)) {
523 524		gfn_t direct_gfn;
524 525
526 clear_sp_write_flooding_count(it.sptep);
525 527		validate_direct_spte(vcpu, it.sptep, direct_access);
526 528
527 529		drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
536 538		link_shadow_page(it.sptep, sp);
537 539	}
538 540
541 clear_sp_write_flooding_count(it.sptep);
539 542	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
540 543		     user_fault, write_fault, emulate, it.level,
541 544		     gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
599 602	 */
600 603	if (!r) {
601 604		pgprintk("%s: guest page fault\n", __func__);
602 if (!prefault) { 605 if (!prefault)
603 606			inject_page_fault(vcpu, &walker.fault);
604 /* reset fork detector */ 607
605 vcpu->arch.last_pt_write_count = 0;
606 }
607 608		return 0;
608 609	}
609 610
@@ -631,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
631 632	if (mmu_notifier_retry(vcpu, mmu_seq))
632 633		goto out_unlock;
633 634
634 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 635 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
635 636	kvm_mmu_free_some_pages(vcpu);
636 637	if (!force_pt_level)
637 638		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
@@ -641,11 +642,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
641 642	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
642 643		 sptep, *sptep, emulate);
643 644
644 if (!emulate)
645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
646
647 645	++vcpu->stat.pf_fixed;
648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 646 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
649 647	spin_unlock(&vcpu->kvm->mmu_lock);
650 648
651 649	return emulate;
@@ -656,65 +654,66 @@ out_unlock:
656 654	return 0;
657 655 }
658 656
657static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
658{
659 int offset = 0;
660
661 WARN_ON(sp->role.level != 1);
662
663 if (PTTYPE == 32)
664 offset = sp->role.quadrant << PT64_LEVEL_BITS;
665
666 return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
667}
668
659 669 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
660 670 {
661 671	struct kvm_shadow_walk_iterator iterator;
662 672	struct kvm_mmu_page *sp;
663 gpa_t pte_gpa = -1;
664 673	int level;
665 674	u64 *sptep;
666 int need_flush = 0;
667 675
668 676	vcpu_clear_mmio_info(vcpu, gva);
669 677
670 spin_lock(&vcpu->kvm->mmu_lock); 678 /*
679 * No need to check the return value here; rmap_can_add() will
680 * make us skip pte prefetch later.
681 */
682 mmu_topup_memory_caches(vcpu);
671 683
684 spin_lock(&vcpu->kvm->mmu_lock);
672 685	for_each_shadow_entry(vcpu, gva, iterator) {
673 686		level = iterator.level;
674 687		sptep = iterator.sptep;
675 688
676 689		sp = page_header(__pa(sptep));
677 690		if (is_last_spte(*sptep, level)) {
678 int offset, shift; 691 pt_element_t gpte;
692 gpa_t pte_gpa;
679 693
680 694			if (!sp->unsync)
681 695				break;
682 696
683 shift = PAGE_SHIFT - 697 pte_gpa = FNAME(get_level1_sp_gpa)(sp);
684 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
685 offset = sp->role.quadrant << shift;
686
687 pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
688 698			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
689 699
690 if (is_shadow_present_pte(*sptep)) { 700 if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
691 if (is_large_pte(*sptep)) 701 kvm_flush_remote_tlbs(vcpu->kvm);
692 --vcpu->kvm->stat.lpages;
693 drop_spte(vcpu->kvm, sptep);
694 need_flush = 1;
695 } else if (is_mmio_spte(*sptep))
696 mmu_spte_clear_no_track(sptep);
697 702
698 break; 703 if (!rmap_can_add(vcpu))
704 break;
705
706 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
707 sizeof(pt_element_t)))
708 break;
709
710 FNAME(update_pte)(vcpu, sp, sptep, &gpte);
699 711		}
700 712
701 713		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
702 714			break;
703 715	}
704
705 if (need_flush)
706 kvm_flush_remote_tlbs(vcpu->kvm);
707
708 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
709
710 716	spin_unlock(&vcpu->kvm->mmu_lock);
711
712 if (pte_gpa == -1)
713 return;
714
715 if (mmu_topup_memory_caches(vcpu))
716 return;
717 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
718 717 }
719 718
720 719 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -769,19 +768,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
769 768  */
770 769 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
771 770 {
772 int i, offset, nr_present; 771 int i, nr_present = 0;
773 772	bool host_writable;
774 773	gpa_t first_pte_gpa;
775 774
776 offset = nr_present = 0;
777
778 775	/* direct kvm_mmu_page can not be unsync. */
779 776	BUG_ON(sp->role.direct);
780 777
781 if (PTTYPE == 32) 778 first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
782 offset = sp->role.quadrant << PT64_LEVEL_BITS;
783
784 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
785 779
786 780	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
787 781		unsigned pte_access;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..7aad5446f393
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,533 @@
1/*
2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 *
6 * Authors:
7 * Avi Kivity <avi@redhat.com>
8 * Gleb Natapov <gleb@redhat.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 */
14
15#include <linux/types.h>
16#include <linux/kvm_host.h>
17#include <linux/perf_event.h>
18#include "x86.h"
19#include "cpuid.h"
20#include "lapic.h"
21
22static struct kvm_arch_event_perf_mapping {
23 u8 eventsel;
24 u8 unit_mask;
25 unsigned event_type;
26 bool inexact;
27} arch_events[] = {
28 /* Index must match CPUID 0x0A.EBX bit vector */
29 [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
30 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
31 [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
32 [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
36};
37
38/* mapping between fixed pmc index and arch_events array */
39int fixed_pmc_events[] = {1, 0, 2};
40
41static bool pmc_is_gp(struct kvm_pmc *pmc)
42{
43 return pmc->type == KVM_PMC_GP;
44}
45
46static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
47{
48 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
49
50 return pmu->counter_bitmask[pmc->type];
51}
52
53static inline bool pmc_enabled(struct kvm_pmc *pmc)
54{
55 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
56 return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
57}
58
59static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
60 u32 base)
61{
62 if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
63 return &pmu->gp_counters[msr - base];
64 return NULL;
65}
66
67static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
68{
69 int base = MSR_CORE_PERF_FIXED_CTR0;
70 if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
71 return &pmu->fixed_counters[msr - base];
72 return NULL;
73}
74
75static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
76{
77 return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
78}
79
80static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
81{
82 if (idx < X86_PMC_IDX_FIXED)
83 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
84 else
85 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
86}
87
88void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
89{
90 if (vcpu->arch.apic)
91 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
92}
93
94static void trigger_pmi(struct irq_work *irq_work)
95{
96 struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
97 irq_work);
98 struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
99 arch.pmu);
100
101 kvm_deliver_pmi(vcpu);
102}
103
104static void kvm_perf_overflow(struct perf_event *perf_event,
105 struct perf_sample_data *data,
106 struct pt_regs *regs)
107{
108 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
109 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
110 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
111}
112
113static void kvm_perf_overflow_intr(struct perf_event *perf_event,
114 struct perf_sample_data *data, struct pt_regs *regs)
115{
116 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
117 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
118 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
119 kvm_perf_overflow(perf_event, data, regs);
120 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
121 * Inject PMI. If the vcpu was in guest mode during the NMI, the
122 * PMI can be injected on guest mode re-entry. Otherwise we can't
123 * be sure that the vcpu wasn't executing a hlt instruction at the
124 * time of vmexit and is not going to re-enter guest mode until
125 * woken up. So we should wake it, but this is impossible from
126 * woken up. So we should wake it, but this is impossible from
127 * NMI context. Do it from irq work instead.
128 */
129 if (!kvm_is_in_guest())
130 irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
131 else
132 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
133 }
134}
135
136static u64 read_pmc(struct kvm_pmc *pmc)
137{
138 u64 counter, enabled, running;
139
140 counter = pmc->counter;
141
142 if (pmc->perf_event)
143 counter += perf_event_read_value(pmc->perf_event,
144 &enabled, &running);
145
146 /* FIXME: Scaling needed? */
147
148 return counter & pmc_bitmask(pmc);
149}
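read_pmc() folds the live perf_event delta into the saved counter and masks the sum with pmc_bitmask(), so a counter of whatever width CPUID advertises to the guest wraps exactly as hardware would. A sketch of the wrap behavior (width 40 here is just an example of a common architectural counter width):

#include <stdint.h>
#include <stdio.h>

/* Sketch of read_pmc(): the architectural counter value is the saved
 * base plus the running perf delta, truncated to the width advertised
 * to the guest (counter_bitmask in the kernel). */
static uint64_t read_pmc_sketch(uint64_t saved, uint64_t perf_delta,
				unsigned width)
{
	uint64_t mask = (width == 64) ? ~0ULL : (1ULL << width) - 1;

	return (saved + perf_delta) & mask;
}

int main(void)
{
	/* a 40-bit counter two ticks from overflow advances by 5 */
	printf("%#llx\n", (unsigned long long)
	       read_pmc_sketch((1ULL << 40) - 2, 5, 40));	/* 0x3 */
	return 0;
}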
150
151static void stop_counter(struct kvm_pmc *pmc)
152{
153 if (pmc->perf_event) {
154 pmc->counter = read_pmc(pmc);
155 perf_event_release_kernel(pmc->perf_event);
156 pmc->perf_event = NULL;
157 }
158}
159
160static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
161 unsigned config, bool exclude_user, bool exclude_kernel,
162 bool intr)
163{
164 struct perf_event *event;
165 struct perf_event_attr attr = {
166 .type = type,
167 .size = sizeof(attr),
168 .pinned = true,
169 .exclude_idle = true,
170 .exclude_host = 1,
171 .exclude_user = exclude_user,
172 .exclude_kernel = exclude_kernel,
173 .config = config,
174 };
175
176 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
177
178 event = perf_event_create_kernel_counter(&attr, -1, current,
179 intr ? kvm_perf_overflow_intr :
180 kvm_perf_overflow, pmc);
181 if (IS_ERR(event)) {
182 printk_once("kvm: pmu event creation failed %ld\n",
183 PTR_ERR(event));
184 return;
185 }
186
187 pmc->perf_event = event;
188 clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
189}

static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
                u8 unit_mask)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(arch_events); i++)
                if (arch_events[i].eventsel == event_select
                    && arch_events[i].unit_mask == unit_mask
                    && (pmu->available_event_types & (1 << i)))
                        break;

        if (i == ARRAY_SIZE(arch_events))
                return PERF_COUNT_HW_MAX;

        return arch_events[i].event_type;
}

static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
        unsigned config, type = PERF_TYPE_RAW;
        u8 event_select, unit_mask;

        pmc->eventsel = eventsel;

        stop_counter(pmc);

        if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
                return;

        event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
        unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

        /* The filter bits live above bit 15, so they must be tested against
         * the full eventsel, not the low-byte event_select. */
        if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
                          ARCH_PERFMON_EVENTSEL_INV |
                          ARCH_PERFMON_EVENTSEL_CMASK))) {
                config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
                                         unit_mask);
                if (config != PERF_COUNT_HW_MAX)
                        type = PERF_TYPE_HARDWARE;
        }

        if (type == PERF_TYPE_RAW)
                config = eventsel & X86_RAW_EVENT_MASK;

        reprogram_counter(pmc, type, config,
                          !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
                          !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
                          eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
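reprogram_gp_counter() carves a PERFEVTSELx value into its architectural fields: event select in bits 0-7, unit mask in bits 8-15, with USR/OS/INT and the edge/invert/cmask filters above them. A quick userspace decode, using the architectural "LLC misses" encoding (event 0x2e, umask 0x41) with EN, OS and USR set — one value that would match find_arch_event() and be mapped to PERF_TYPE_HARDWARE:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t eventsel = 0x43412eULL;  /* EN|OS|USR, umask 0x41, event 0x2e */
        uint8_t event_select = eventsel & 0xff;       /* ARCH_PERFMON_EVENTSEL_EVENT */
        uint8_t unit_mask = (eventsel >> 8) & 0xff;   /* ARCH_PERFMON_EVENTSEL_UMASK >> 8 */

        assert(event_select == 0x2e && unit_mask == 0x41);
        assert(eventsel & (1ULL << 22));              /* ENABLE bit */
        return 0;
}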

static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
{
        unsigned en = en_pmi & 0x3;
        bool pmi = en_pmi & 0x8;

        stop_counter(pmc);

        if (!en || !pmc_enabled(pmc))
                return;

        reprogram_counter(pmc, PERF_TYPE_HARDWARE,
                          arch_events[fixed_pmc_events[idx]].event_type,
                          !(en & 0x2), /* exclude user */
                          !(en & 0x1), /* exclude kernel */
                          pmi);
}

static inline u8 fixed_en_pmi(u64 ctrl, int idx)
{
        return (ctrl >> (idx * 4)) & 0xf;
}
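Each fixed counter owns one nibble of IA32_FIXED_CTR_CTRL: bits 0-1 enable counting in ring 0 and ring 3 respectively, and bit 3 requests a PMI on overflow, which is exactly how reprogram_fixed_counter() interprets the en and pmi pieces above. A userspace check of the extraction for counter 1:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t fixed_ctr_ctrl = 0xb0;   /* counter 1: en = 0x3, pmi = 0x8 */
        int idx = 1;
        uint8_t en_pmi = (fixed_ctr_ctrl >> (idx * 4)) & 0xf;

        assert(en_pmi == 0xb);
        assert((en_pmi & 0x3) == 0x3);    /* count in both kernel and user mode */
        assert(en_pmi & 0x8);             /* PMI on overflow requested */
        return 0;
}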

static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
        int i;

        for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
                u8 en_pmi = fixed_en_pmi(data, i);
                struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);

                if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
                        continue;

                reprogram_fixed_counter(pmc, en_pmi, i);
        }

        pmu->fixed_ctr_ctrl = data;
}

static void reprogram_idx(struct kvm_pmu *pmu, int idx)
{
        struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);

        if (!pmc)
                return;

        if (pmc_is_gp(pmc))
                reprogram_gp_counter(pmc, pmc->eventsel);
        else {
                int fidx = idx - X86_PMC_IDX_FIXED;
                reprogram_fixed_counter(pmc,
                                fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
        }
}

static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
{
        int bit;
        u64 diff = pmu->global_ctrl ^ data;

        pmu->global_ctrl = data;

        for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
                reprogram_idx(pmu, bit);
}
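XOR against the previous GLOBAL_CTRL value leaves set bits exactly where the enable state changed, so only those counters get reprogrammed; counters whose bits are unchanged are left running. The trick in isolation:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t old_ctrl = 0x3;          /* counters 0 and 1 enabled */
        uint64_t new_ctrl = 0x6;          /* counters 1 and 2 enabled */
        uint64_t diff = old_ctrl ^ new_ctrl;

        assert(diff == 0x5);              /* 0 and 2 changed; 1 is untouched */
        return 0;
}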

bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        int ret;

        switch (msr) {
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
        case MSR_CORE_PERF_GLOBAL_STATUS:
        case MSR_CORE_PERF_GLOBAL_CTRL:
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
                ret = pmu->version > 1;
                break;
        default:
                ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
                        || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
                        || get_fixed_pmc(pmu, msr);
                break;
        }
        return ret;
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        struct kvm_pmc *pmc;

        switch (index) {
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
                *data = pmu->fixed_ctr_ctrl;
                return 0;
        case MSR_CORE_PERF_GLOBAL_STATUS:
                *data = pmu->global_status;
                return 0;
        case MSR_CORE_PERF_GLOBAL_CTRL:
                *data = pmu->global_ctrl;
                return 0;
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
                *data = pmu->global_ovf_ctrl;
                return 0;
        default:
                if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
                    (pmc = get_fixed_pmc(pmu, index))) {
                        *data = read_pmc(pmc);
                        return 0;
                } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
                        *data = pmc->eventsel;
                        return 0;
                }
        }
        return 1;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        struct kvm_pmc *pmc;

        switch (index) {
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
                if (pmu->fixed_ctr_ctrl == data)
                        return 0;
                if (!(data & 0xfffffffffffff444)) {
                        reprogram_fixed_counters(pmu, data);
                        return 0;
                }
                break;
        case MSR_CORE_PERF_GLOBAL_STATUS:
                break; /* RO MSR */
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (pmu->global_ctrl == data)
                        return 0;
                if (!(data & pmu->global_ctrl_mask)) {
                        global_ctrl_changed(pmu, data);
                        return 0;
                }
                break;
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
                if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
                        pmu->global_status &= ~data;
                        pmu->global_ovf_ctrl = data;
                        return 0;
                }
                break;
        default:
                if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
                    (pmc = get_fixed_pmc(pmu, index))) {
                        data = (s64)(s32)data;
                        pmc->counter += data - read_pmc(pmc);
                        return 0;
                } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
                        if (data == pmc->eventsel)
                                return 0;
                        if (!(data & 0xffffffff00200000ull)) {
                                reprogram_gp_counter(pmc, data);
                                return 0;
                        }
                }
        }
        return 1;
}
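The (s64)(s32) double cast in the counter-write path mirrors the legacy counter interface, where a WRMSR to IA32_PMCx is sign-extended from bit 31 up to the full counter width. What that narrowing does to a guest value, checked standalone:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t wrmsr_value = 0xffffff00;               /* looks like -256 */
        int64_t counter = (int64_t)(int32_t)wrmsr_value; /* same double cast */

        assert(counter == -256);          /* bit 31 replicated upward */
        return 0;
}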

int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        bool fast_mode = pmc & (1u << 31);
        bool fixed = pmc & (1u << 30);
        struct kvm_pmc *counters;
        u64 ctr;

        pmc &= ~(3u << 30);     /* strip both flag bits; only the index remains */
        if (!fixed && pmc >= pmu->nr_arch_gp_counters)
                return 1;
        if (fixed && pmc >= pmu->nr_arch_fixed_counters)
                return 1;
        counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
        ctr = read_pmc(&counters[pmc]);
        if (fast_mode)
                ctr = (u32)ctr;
        *data = ctr;

        return 0;
}
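RDPMC multiplexes three things through ECX: bit 30 selects the fixed-counter range, bit 31 requests a "fast" 32-bit read, and the low bits carry the counter index — so the index mask must clear both flag bits before the range checks. A userspace decode mirroring the bit tests above, for ECX = 0x40000001 (fixed counter 1, no fast mode):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t ecx = 0x40000001;
        int fast_mode = !!(ecx & (1u << 31));
        int fixed = !!(ecx & (1u << 30));
        uint32_t idx = ecx & ~(3u << 30);   /* both flag bits cleared */

        assert(!fast_mode && fixed && idx == 1);
        return 0;
}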

void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        struct kvm_cpuid_entry2 *entry;
        unsigned bitmap_len;

        pmu->nr_arch_gp_counters = 0;
        pmu->nr_arch_fixed_counters = 0;
        pmu->counter_bitmask[KVM_PMC_GP] = 0;
        pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
        pmu->version = 0;

        entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
        if (!entry)
                return;

        pmu->version = entry->eax & 0xff;
        if (!pmu->version)
                return;

        pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
                                       X86_PMC_MAX_GENERIC);
        pmu->counter_bitmask[KVM_PMC_GP] =
                ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
        bitmap_len = (entry->eax >> 24) & 0xff;
        pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);

        if (pmu->version == 1) {
                pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
                return;
        }

        pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
                                          X86_PMC_MAX_FIXED);
        pmu->counter_bitmask[KVM_PMC_FIXED] =
                ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
        pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
                | (((1ull << pmu->nr_arch_fixed_counters) - 1)
                   << X86_PMC_IDX_FIXED));
}
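kvm_pmu_cpuid_update() is a straight unpack of CPUID leaf 0xA: EAX carries the PMU version, number of general-purpose counters, counter width, and the length of the event-availability bitmap; EDX carries the fixed-counter count and width. Decoding a plausible sample value — 4 GP counters and 3 fixed counters, both 48 bits wide, 7 event flags; the numbers are illustrative, not taken from any particular CPU:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t eax = 0x07300402;  /* 7 event bits | width 48 | 4 GP ctrs | v2 */
        uint32_t edx = 0x00000603;  /* width 48 | 3 fixed counters */

        assert((eax & 0xff) == 2);            /* version */
        assert(((eax >> 8) & 0xff) == 4);     /* nr_arch_gp_counters */
        assert(((eax >> 16) & 0xff) == 48);   /* GP counter bit width */
        assert(((eax >> 24) & 0xff) == 7);    /* event bitmap length */
        assert((edx & 0x1f) == 3);            /* nr_arch_fixed_counters */
        assert(((edx >> 5) & 0xff) == 48);    /* fixed counter bit width */
        return 0;
}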

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
        int i;
        struct kvm_pmu *pmu = &vcpu->arch.pmu;

        memset(pmu, 0, sizeof(*pmu));
        for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
                pmu->gp_counters[i].type = KVM_PMC_GP;
                pmu->gp_counters[i].vcpu = vcpu;
                pmu->gp_counters[i].idx = i;
        }
        for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
                pmu->fixed_counters[i].type = KVM_PMC_FIXED;
                pmu->fixed_counters[i].vcpu = vcpu;
                pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
        }
        init_irq_work(&pmu->irq_work, trigger_pmi);
        kvm_pmu_cpuid_update(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        int i;

        irq_work_sync(&pmu->irq_work);
        for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
                struct kvm_pmc *pmc = &pmu->gp_counters[i];
                stop_counter(pmc);
                pmc->counter = pmc->eventsel = 0;
        }

        for (i = 0; i < X86_PMC_MAX_FIXED; i++)
                stop_counter(&pmu->fixed_counters[i]);

        pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
                pmu->global_ovf_ctrl = 0;
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_pmu_reset(vcpu);
}

void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        u64 bitmask;
        int bit;

        bitmask = pmu->reprogram_pmi;

        for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
                struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);

                if (unlikely(!pmc || !pmc->perf_event)) {
                        clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
                        continue;
                }

                reprogram_idx(pmu, bit);
        }
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e32243eac2f4..5fa553babe56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_NMI);
 	set_intercept(svm, INTERCEPT_SMI);
 	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+	set_intercept(svm, INTERCEPT_RDPMC);
 	set_intercept(svm, INTERCEPT_CPUID);
 	set_intercept(svm, INTERCEPT_INVD);
 	set_intercept(svm, INTERCEPT_HLT);
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+	int err;
+
+	if (!static_cpu_has(X86_FEATURE_NRIPS))
+		return emulate_on_interception(svm);
+
+	err = kvm_rdpmc(&svm->vcpu);
+	kvm_complete_insn_gp(&svm->vcpu, err);
+
+	return 1;
+}
+
 bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
 {
 	unsigned long cr0 = svm->vcpu.arch.cr0;
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	[SVM_EXIT_RDPMC]			= rdpmc_interception,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]				= iret_interception,
 	[SVM_EXIT_INVD]				= emulate_on_interception,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea1cd83..6b85cc647f34 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
 #include <linux/atomic.h>
 #include "kvm_timer.h"
 
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
 {
-	int restart_timer = 0;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+	struct kvm_vcpu *vcpu = ktimer->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
 
 	/*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
-		restart_timer = 1;
-	}
-
-	return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-	int restart_timer;
-	struct kvm_vcpu *vcpu;
-	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
-	vcpu = ktimer->vcpu;
-	if (!vcpu)
-		return HRTIMER_NORESTART;
-
-	restart_timer = __kvm_timer_fn(vcpu, ktimer);
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
-
+		return HRTIMER_RESTART;
+	} else
+		return HRTIMER_NORESTART;
+}
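The rewritten kvm_timer_fn() leans on container_of() to get from the hrtimer embedded in struct kvm_timer back to the enclosing structure, instead of threading a vcpu pointer through a helper. The idiom in isolation, with stand-in types (not the kernel's definitions):

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct hrtimer_stub { int dummy; };
struct timer_outer { long period; struct hrtimer_stub timer; };

int main(void)
{
        struct timer_outer kt = { .period = 42 };
        struct hrtimer_stub *data = &kt.timer;   /* what the callback receives */
        struct timer_outer *back = container_of(data, struct timer_outer, timer);

        assert(back == &kt && back->period == 42);
        return 0;
}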
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b51696a..d29216c462b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -18,6 +18,7 @@
 
 #include "irq.h"
 #include "mmu.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
@@ -50,29 +51,29 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly enable_vpid = 1;
+static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
 			enable_unrestricted_guest, bool, S_IRUGO);
 
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static int __read_mostly yield_on_hlt = 1;
+static bool __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
 /*
@@ -80,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  */
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
@@ -1747,7 +1748,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 	int save_nmsrs, index;
 	unsigned long *msr_bitmap;
 
-	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
 		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
 		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+		CPU_BASED_RDPMC_EXITING |
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	/*
 	 * We can allow some features even when not supported by the
@@ -2142,12 +2143,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		return 1;
 		/* Otherwise falls through */
 	default:
-		vmx_load_host_state(to_vmx(vcpu));
 		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
 			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
-			vmx_load_host_state(to_vmx(vcpu));
 			data = msr->data;
 			break;
 		}
@@ -2171,7 +2170,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
 	switch (msr_index) {
 	case MSR_EFER:
-		vmx_load_host_state(vmx);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		break;
 #ifdef CONFIG_X86_64
@@ -2220,7 +2218,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
-			vmx_load_host_state(vmx);
 			msr->data = data;
 			break;
 		}
@@ -2414,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_TSC_OFFSETING |
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
-	      CPU_BASED_INVLPG_EXITING;
+	      CPU_BASED_INVLPG_EXITING |
+	      CPU_BASED_RDPMC_EXITING;
 
 	if (yield_on_hlt)
 		min |= CPU_BASED_HLT_EXITING;
@@ -2716,11 +2714,13 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 {
 	if (!kvm->arch.tss_addr) {
 		struct kvm_memslots *slots;
+		struct kvm_memory_slot *slot;
 		gfn_t base_gfn;
 
 		slots = kvm_memslots(kvm);
-		base_gfn = slots->memslots[0].base_gfn +
-			 kvm->memslots->memslots[0].npages - 3;
+		slot = id_to_memslot(slots, 0);
+		base_gfn = slot->base_gfn + slot->npages - 3;
+
 		return base_gfn << PAGE_SHIFT;
 	}
 	return kvm->arch.tss_addr;
@@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		/* We can get here when nested_run_pending caused
-		 * vmx_interrupt_allowed() to return false. In this case, do
-		 * nothing - the interrupt will be injected later.
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		/*
+		 * We get here if vmx_interrupt_allowed() said we can't
+		 * inject to L1 now because L2 must run. Ask L2 to exit
+		 * right after entry, so we can inject to L1 more promptly.
 		 */
+		kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 		return;
+	}
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -4077,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
-		struct vmcs12 *vmcs12;
-		if (to_vmx(vcpu)->nested.nested_run_pending)
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (to_vmx(vcpu)->nested.nested_run_pending ||
+		    (vmcs12->idt_vectoring_info_field &
+		     VECTORING_INFO_VALID_MASK))
 			return 0;
 		nested_vmx_vmexit(vcpu);
-		vmcs12 = get_vmcs12(vcpu);
 		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
 		vmcs12->vm_exit_intr_info = 0;
 		/* fall through to normal code, but now in L1, not L2 */
@@ -4611,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	err = kvm_rdpmc(vcpu);
+	kvm_complete_insn_gp(vcpu, err);
+
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
@@ -5561,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
+	[EXIT_REASON_RDPMC]		      = handle_rdpmc,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c938da2ba00..14d6cadc4ba6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
 #include "tss.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -82,15 +83,13 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				   struct kvm_cpuid_entry2 __user *entries);
 static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
@@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
-
-static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	if (!best)
-		return;
-
-	/* Update OSXSAVE bit */
-	if (cpu_has_xsave && best->function == 0x1) {
-		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
-			best->ecx |= bit(X86_FEATURE_OSXSAVE);
-	}
-
-	if (apic) {
-		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
-			apic->lapic_timer.timer_mode_mask = 3 << 17;
-		else
-			apic->lapic_timer.timer_mode_mask = 1 << 17;
-	}
-}
-
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		kvm_mmu_reset_context(vcpu);
 
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
+		kvm_update_cpuid(vcpu);
 
 	return 0;
 }
@@ -809,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	u64 data;
+	int err;
+
+	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+	if (err)
+		return err;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
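kvm_rdpmc() above finishes the instruction the way hardware does: the 64-bit counter value is returned split across EDX:EAX. The split in isolation, checked standalone:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t data = 0x123456789abcULL;   /* value read from the counter */
        uint32_t eax = (uint32_t)data;       /* low half -> RAX */
        uint32_t edx = (uint32_t)(data >> 32);  /* high half -> RDX */

        assert(eax == 0x56789abc && edx == 0x1234);
        return 0;
}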
@@ -1358,12 +1324,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 	if (page_num >= blob_size)
 		goto out;
 	r = -ENOMEM;
-	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!page)
+	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+	if (IS_ERR(page)) {
+		r = PTR_ERR(page);
 		goto out;
-	r = -EFAULT;
-	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
-		goto out_free;
+	}
 	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
 		goto out_free;
 	r = 0;
@@ -1652,8 +1617,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	 * which we perfectly emulate ;-). Any other value should be at least
 	 * reported, some guests depend on them.
 	 */
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_EVNTSEL1:
 	case MSR_K7_EVNTSEL2:
@@ -1665,8 +1628,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	/* at least RHEL 4 unconditionally writes to the perfctr registers,
 	 * so we ignore writes to make it happy.
 	 */
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
 	case MSR_K7_PERFCTR0:
 	case MSR_K7_PERFCTR1:
 	case MSR_K7_PERFCTR2:
@@ -1703,6 +1664,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_set_msr(vcpu, msr, data);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				msr, data);
@@ -1865,10 +1828,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_SYSCFG:
 	case MSR_K7_HWCR:
 	case MSR_VM_HSAVE_PA:
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_PERFCTR0:
 	case MSR_K8_INT_PENDING_MSG:
@@ -1979,6 +1938,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = 0xbe702111;
 		break;
 	default:
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_get_msr(vcpu, msr, pdata);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 			return 1;
@@ -2037,15 +1998,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 	if (msrs.nmsrs >= MAX_IO_MSRS)
 		goto out;
 
-	r = -ENOMEM;
 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-	entries = kmalloc(size, GFP_KERNEL);
-	if (!entries)
+	entries = memdup_user(user_msrs->entries, size);
+	if (IS_ERR(entries)) {
+		r = PTR_ERR(entries);
 		goto out;
-
-	r = -EFAULT;
-	if (copy_from_user(entries, user_msrs->entries, size))
-		goto out_free;
+	}
 
 	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 	if (r < 0)
@@ -2265,466 +2223,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
-static int is_efer_nx(void)
-{
-	unsigned long long efer = 0;
-
-	rdmsrl_safe(MSR_EFER, &efer);
-	return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_cpuid_entry2 *e, *entry;
-
-	entry = NULL;
-	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-		e = &vcpu->arch.cpuid_entries[i];
-		if (e->function == 0x80000001) {
-			entry = e;
-			break;
-		}
-	}
-	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
-		entry->edx &= ~(1 << 20);
-		printk(KERN_INFO "kvm: guest NX capability removed\n");
-	}
-}
-
-/* when an old userspace process fills a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid *cpuid,
-				    struct kvm_cpuid_entry __user *entries)
-{
-	int r, i;
-	struct kvm_cpuid_entry *cpuid_entries;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-		goto out_free;
-	for (i = 0; i < cpuid->nent; i++) {
-		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
-		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
-		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
-		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
-		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
-		vcpu->arch.cpuid_entries[i].index = 0;
-		vcpu->arch.cpuid_entries[i].flags = 0;
-		vcpu->arch.cpuid_entries[i].padding[0] = 0;
-		vcpu->arch.cpuid_entries[i].padding[1] = 0;
-		vcpu->arch.cpuid_entries[i].padding[2] = 0;
-	}
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	cpuid_fix_nx_cap(vcpu);
-	r = 0;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-	return 0;
-
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent < vcpu->arch.cpuid_nent)
-		goto out;
-	r = -EFAULT;
-	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	return 0;
-
-out:
-	cpuid->nent = vcpu->arch.cpuid_nent;
-	return r;
-}
-
-static void cpuid_mask(u32 *word, int wordnum)
-{
-	*word &= boot_cpu_data.x86_capability[wordnum];
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			   u32 index)
-{
-	entry->function = function;
-	entry->index = index;
-	cpuid_count(entry->function, entry->index,
-		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-	entry->flags = 0;
-}
-
-static bool supported_xcr0_bit(unsigned bit)
-{
-	u64 mask = ((u64)1 << bit);
-
-	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
-}
-
-#define F(x) bit(X86_FEATURE_##x)
-
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			 u32 index, int *nent, int maxnent)
-{
-	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
-	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
-				? F(GBPAGES) : 0;
-	unsigned f_lm = F(LM);
-#else
-	unsigned f_gbpages = 0;
-	unsigned f_lm = 0;
-#endif
-	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-
-	/* cpuid 1.edx */
-	const u32 kvm_supported_word0_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
-		0 /* Reserved, DS, ACPI */ | F(MMX) |
-		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
-		0 /* HTT, TM, Reserved, PBE */;
-	/* cpuid 0x80000001.edx */
-	const u32 kvm_supported_word1_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* Reserved */ |
-		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
-		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
-	/* cpuid 1.ecx */
-	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
-		0 /* DS-CPL, VMX, SMX, EST */ |
-		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
-		0 /* Reserved, DCA */ | F(XMM4_1) |
-		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C) | F(RDRAND);
-	/* cpuid 0x80000001.ecx */
-	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
-		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
-		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
-
-	/* cpuid 0xC0000001.edx */
-	const u32 kvm_supported_word5_x86_features =
-		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
-		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
-		F(PMM) | F(PMM_EN);
-
-	/* cpuid 7.0.ebx */
-	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE) | F(ERMS);
-
-	/* all calls to cpuid_count() should be made on the same cpu */
-	get_cpu();
-	do_cpuid_1_ent(entry, function, index);
-	++*nent;
-
-	switch (function) {
-	case 0:
-		entry->eax = min(entry->eax, (u32)0xd);
-		break;
-	case 1:
-		entry->edx &= kvm_supported_word0_x86_features;
-		cpuid_mask(&entry->edx, 0);
-		entry->ecx &= kvm_supported_word4_x86_features;
-		cpuid_mask(&entry->ecx, 4);
-		/* we support x2apic emulation even if host does not support
-		 * it since we emulate x2apic in software */
-		entry->ecx |= F(X2APIC);
-		break;
-	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
-	 * may return different values. This forces us to get_cpu() before
-	 * issuing the first command, and also to emulate this annoying behavior
-	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
-	case 2: {
-		int t, times = entry->eax & 0xff;
-
-		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-		for (t = 1; t < times && *nent < maxnent; ++t) {
-			do_cpuid_1_ent(&entry[t], function, 0);
-			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-			++*nent;
-		}
-		break;
-	}
-	/* function 4 has additional index. */
-	case 4: {
-		int i, cache_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until cache_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			cache_type = entry[i - 1].eax & 0x1f;
-			if (!cache_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-				KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 7: {
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capability word 9 */
-		if (index == 0) {
-			entry->ebx &= kvm_supported_word9_x86_features;
-			cpuid_mask(&entry->ebx, 9);
-		} else
-			entry->ebx = 0;
-		entry->eax = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	}
-	case 9:
-		break;
-	/* function 0xb has additional index. */
-	case 0xb: {
-		int i, level_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until level_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			level_type = entry[i - 1].ecx & 0xff00;
-			if (!level_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-				KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 0xd: {
-		int idx, i;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
-			do_cpuid_1_ent(&entry[i], function, idx);
-			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
-				continue;
-			entry[i].flags |=
-				KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-			++i;
-		}
-		break;
-	}
-	case KVM_CPUID_SIGNATURE: {
-		char signature[12] = "KVMKVMKVM\0\0";
-		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
-		entry->ebx = sigptr[0];
-		entry->ecx = sigptr[1];
-		entry->edx = sigptr[2];
-		break;
-	}
-	case KVM_CPUID_FEATURES:
-		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
-			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
-			     (1 << KVM_FEATURE_ASYNC_PF) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-
-		if (sched_info_on())
-			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
-
-		entry->ebx = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	case 0x80000000:
-		entry->eax = min(entry->eax, 0x8000001a);
-		break;
-	case 0x80000001:
-		entry->edx &= kvm_supported_word1_x86_features;
-		cpuid_mask(&entry->edx, 1);
-		entry->ecx &= kvm_supported_word6_x86_features;
-		cpuid_mask(&entry->ecx, 6);
-		break;
-	case 0x80000008: {
-		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
-		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
-		unsigned phys_as = entry->eax & 0xff;
-
-		if (!g_phys_as)
-			g_phys_as = phys_as;
-		entry->eax = g_phys_as | (virt_as << 8);
-		entry->ebx = entry->edx = 0;
-		break;
-	}
-	case 0x80000019:
-		entry->ecx = entry->edx = 0;
-		break;
-	case 0x8000001a:
-		break;
-	case 0x8000001d:
-		break;
-	/* Add support for Centaur's CPUID instruction. */
-	case 0xC0000000:
-		/* Just support up to 0xC0000004 now. */
-		entry->eax = min(entry->eax, 0xC0000004);
-		break;
-	case 0xC0000001:
-		entry->edx &= kvm_supported_word5_x86_features;
-		cpuid_mask(&entry->edx, 5);
-		break;
-	case 3: /* Processor serial number */
-	case 5: /* MONITOR/MWAIT */
-	case 6: /* Thermal management */
-	case 0xA: /* Architectural Performance Monitoring */
-	case 0x80000007: /* Advanced power management */
-	case 0xC0000002:
-	case 0xC0000003:
-	case 0xC0000004:
-	default:
-		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
-		break;
-	}
-
-	kvm_x86_ops->set_supported_cpuid(function, entry);
-
-	put_cpu();
-}
-
-#undef F
-
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-					     struct kvm_cpuid_entry2 __user *entries)
-{
-	struct kvm_cpuid_entry2 *cpuid_entries;
-	int limit, nent = 0, r = -E2BIG;
-	u32 func;
-
-	if (cpuid->nent < 1)
-		goto out;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-
-	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[0].eax;
-	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[nent - 1].eax;
-	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	/* Add support for Centaur's CPUID instruction. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
-		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
-			     &nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-
-		limit = cpuid_entries[nent - 1].eax;
-		for (func = 0xC0000001;
-		     func <= limit && nent < cpuid->nent; ++func)
-			do_cpuid_ent(&cpuid_entries[nent], func, 0,
-				     &nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-	}
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	r = -EFAULT;
-	if (copy_to_user(entries, cpuid_entries,
-			 nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out_free;
-	cpuid->nent = nent;
-	r = 0;
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
@@ -3042,13 +2540,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.lapic)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+		u.lapic = memdup_user(argp, sizeof(*u.lapic));
+		if (IS_ERR(u.lapic)) {
+			r = PTR_ERR(u.lapic);
 			goto out;
+		}
+
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
@@ -3227,14 +2724,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xsave)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
-			break;
+		u.xsave = memdup_user(argp, sizeof(*u.xsave));
+		if (IS_ERR(u.xsave)) {
+			r = PTR_ERR(u.xsave);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -3255,15 +2749,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xcrs)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xcrs, argp,
-				   sizeof(struct kvm_xcrs)))
-			break;
+		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+		if (IS_ERR(u.xcrs)) {
+			r = PTR_ERR(u.xcrs);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -3460,16 +2950,59 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 	return 0;
 }
 
+/**
+ * write_protect_slot - write protect a slot for dirty logging
+ * @kvm: the kvm instance
+ * @memslot: the slot we protect
+ * @dirty_bitmap: the bitmap indicating which pages are dirty
+ * @nr_dirty_pages: the number of dirty pages
+ *
+ * We have two ways to find all sptes to protect:
+ * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
+ *    checks ones that have a spte mapping a page in the slot.
+ * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ *
+ * Generally speaking, if there are not so many dirty pages compared to the
+ * number of shadow pages, we should use the latter.
+ *
+ * Note that letting others write into a page marked dirty in the old bitmap
+ * by using the remaining tlb entry is not a problem. That page will become
+ * write protected again when we flush the tlb and then be reported dirty to
+ * the user space by copying the old bitmap.
+ */
+static void write_protect_slot(struct kvm *kvm,
+			       struct kvm_memory_slot *memslot,
+			       unsigned long *dirty_bitmap,
+			       unsigned long nr_dirty_pages)
+{
+	/* Not many dirty pages compared to # of shadow pages. */
+	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
+		unsigned long gfn_offset;
+
+		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
+			unsigned long gfn = memslot->base_gfn + gfn_offset;
+
+			spin_lock(&kvm->mmu_lock);
+			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
+			spin_unlock(&kvm->mmu_lock);
+		}
+		kvm_flush_remote_tlbs(kvm);
+	} else {
+		spin_lock(&kvm->mmu_lock);
+		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
+		spin_unlock(&kvm->mmu_lock);
+	}
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 			       struct kvm_dirty_log *log)
 {
-	int r, i;
+	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n;
-	unsigned long is_dirty = 0;
+	unsigned long n, nr_dirty_pages;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3477,43 +3010,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_MEMORY_SLOTS)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[log->slot];
+	memslot = id_to_memslot(kvm->memslots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
-
-	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
-		is_dirty = memslot->dirty_bitmap[i];
+	nr_dirty_pages = memslot->nr_dirty_pages;
 
 	/* If nothing is dirty, don't bother messing with page tables. */
-	if (is_dirty) {
+	if (nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap;
+		unsigned long *dirty_bitmap, *dirty_bitmap_head;
 
-		dirty_bitmap = memslot->dirty_bitmap_head;
-		if (memslot->dirty_bitmap == dirty_bitmap)
-			dirty_bitmap += n / sizeof(long);
-		memset(dirty_bitmap, 0, n);
+		dirty_bitmap = memslot->dirty_bitmap;
+		dirty_bitmap_head = memslot->dirty_bitmap_head;
+		if (dirty_bitmap == dirty_bitmap_head)
+			dirty_bitmap_head += n / sizeof(long);
+		memset(dirty_bitmap_head, 0, n);
 
 		r = -ENOMEM;
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
 		if (!slots)
 			goto out;
-		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
-		slots->generation++;
+
+		memslot = id_to_memslot(slots, log->slot);
+		memslot->nr_dirty_pages = 0;
+		memslot->dirty_bitmap = dirty_bitmap_head;
+		update_memslots(slots, NULL);
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
 		synchronize_srcu_expedited(&kvm->srcu);
-		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 
-		spin_lock(&kvm->mmu_lock);
-		kvm_mmu_slot_remove_write_access(kvm, log->slot);
-		spin_unlock(&kvm->mmu_lock);
+		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
 
 		r = -EFAULT;
 		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
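The rework above makes write protection cost proportional to whichever set is smaller: with few dirty pages it walks the dirty bitmap and protects each gfn through the rmap; otherwise it walks the shadow pages once. As the write_protect_slot() comment notes, the decision reduces to a single comparison, sketched here standalone with stand-in names (the threshold and the field names follow the hunk; nothing beyond that is implied):

#include <stdbool.h>

static bool protect_per_gfn(unsigned long nr_dirty_pages,
                            unsigned long n_used_mmu_pages)
{
        /* Few dirty pages: a per-gfn rmap walk beats visiting
         * every shadow page in the slot. */
        return nr_dirty_pages < n_used_mmu_pages;
}

int main(void)
{
        return protect_per_gfn(8, 1024) ? 0 : 1;  /* 8 dirty vs 1024 shadow */
}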
@@ -3658,14 +3189,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto get_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto get_irqchip_out;
@@ -3684,14 +3215,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_SET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto set_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto set_irqchip_out;
@@ -3898,12 +3429,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
-	return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
 	gpa_t t_gpa;
 	struct x86_exception exception;
@@ -4087,7 +3613,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
 	return 1;
 }
 
@@ -4324,7 +3850,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
 	return X86EMUL_CONTINUE;
 
@@ -4349,32 +3875,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	return r;
 }
 
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-				    int size, unsigned short port, void *val,
-				    unsigned int count)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+			       unsigned short port, void *val,
+			       unsigned int count, bool in)
 {
-	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
-	if (vcpu->arch.pio.count)
-		goto data_avail;
-
-	trace_kvm_pio(0, port, size, count);
+	trace_kvm_pio(!in, port, size, count);
 
 	vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.count = count;
 	vcpu->arch.pio.size = size;
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-	data_avail:
-		memcpy(val, vcpu->arch.pio_data, size * count);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 	vcpu->run->io.size = size;
 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 	vcpu->run->io.count = count;
 	vcpu->run->io.port = port;
 
@@ -4383,36 +3901,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4383 return 0; 3901 return 0;
4384} 3902}
4385 3903
4386static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, 3904static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4387 int size, unsigned short port, 3905 int size, unsigned short port, void *val,
4388 const void *val, unsigned int count) 3906 unsigned int count)
4389{ 3907{
4390 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3908 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3909 int ret;
4391 3910
4392 trace_kvm_pio(1, port, size, count); 3911 if (vcpu->arch.pio.count)
4393 3912 goto data_avail;
4394 vcpu->arch.pio.port = port;
4395 vcpu->arch.pio.in = 0;
4396 vcpu->arch.pio.count = count;
4397 vcpu->arch.pio.size = size;
4398
4399 memcpy(vcpu->arch.pio_data, val, size * count);
4400 3913
4401 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3914 ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
3915 if (ret) {
3916data_avail:
3917 memcpy(val, vcpu->arch.pio_data, size * count);
4402 vcpu->arch.pio.count = 0; 3918 vcpu->arch.pio.count = 0;
4403 return 1; 3919 return 1;
4404 } 3920 }
4405 3921
4406 vcpu->run->exit_reason = KVM_EXIT_IO;
4407 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
4408 vcpu->run->io.size = size;
4409 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4410 vcpu->run->io.count = count;
4411 vcpu->run->io.port = port;
4412
4413 return 0; 3922 return 0;
4414} 3923}
4415 3924
3925static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
3926 int size, unsigned short port,
3927 const void *val, unsigned int count)
3928{
3929 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3930
3931 memcpy(vcpu->arch.pio_data, val, size * count);
3932 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
3933}
3934
4416static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3935static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4417{ 3936{
4418 return kvm_x86_ops->get_segment_base(vcpu, seg); 3937 return kvm_x86_ops->get_segment_base(vcpu, seg);
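The hunks above collapse the duplicated bodies of emulator_pio_in_emulated() and emulator_pio_out_emulated() into a single emulator_pio_in_out() helper parameterized by a bool direction flag; only the direction-specific memcpy stays in the thin wrappers. A minimal userspace sketch of the same deduplication pattern (all names hypothetical, not kernel code):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static char pio_buf[64];

/* Shared bookkeeping for both directions (stand-in for
 * emulator_pio_in_out(); always reports "handled in-kernel" here). */
static int pio_common(int size, unsigned short port, unsigned int count,
                      bool in)
{
        printf("pio %s: port=%#x size=%d count=%u\n",
               in ? "in" : "out", port, size, count);
        return 1;
}

static int pio_in(int size, unsigned short port, void *val,
                  unsigned int count)
{
        int ret = pio_common(size, port, count, true);

        if (ret)        /* data is available: copy it out to the caller */
                memcpy(val, pio_buf, (size_t)size * count);
        return ret;
}

static int pio_out(int size, unsigned short port, const void *val,
                   unsigned int count)
{
        memcpy(pio_buf, val, (size_t)size * count);     /* stage data first */
        return pio_common(size, port, count, false);
}

int main(void)
{
        unsigned char b = 0xab;

        pio_out(1, 0x3f8, &b, 1);
        pio_in(1, 0x3f8, &b, 1);
        return 0;
}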
@@ -4627,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4627 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4146 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4628} 4147}
4629 4148
4149static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4150 u32 pmc, u64 *pdata)
4151{
4152 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4153}
4154
4630static void emulator_halt(struct x86_emulate_ctxt *ctxt) 4155static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4631{ 4156{
4632 emul_to_vcpu(ctxt)->arch.halt_request = 1; 4157 emul_to_vcpu(ctxt)->arch.halt_request = 1;
@@ -4679,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = {
4679 .set_dr = emulator_set_dr, 4204 .set_dr = emulator_set_dr,
4680 .set_msr = emulator_set_msr, 4205 .set_msr = emulator_set_msr,
4681 .get_msr = emulator_get_msr, 4206 .get_msr = emulator_get_msr,
4207 .read_pmc = emulator_read_pmc,
4682 .halt = emulator_halt, 4208 .halt = emulator_halt,
4683 .wbinvd = emulator_wbinvd, 4209 .wbinvd = emulator_wbinvd,
4684 .fix_hypercall = emulator_fix_hypercall, 4210 .fix_hypercall = emulator_fix_hypercall,
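Wiring RDPMC support into the emulator needs only the one-line .read_pmc slot above plus the emulator_read_pmc() trampoline; the emulator never calls into KVM directly, only through this x86_emulate_ops function-pointer table. A sketch of that ops-table pattern (names invented, not the kernel API):

#include <stdint.h>
#include <stdio.h>

struct emu_ops {
        /* one slot per service the instruction emulator may request */
        int (*read_pmc)(void *ctxt, uint32_t pmc, uint64_t *pdata);
};

/* Trampoline standing in for emulator_read_pmc(): returns 0 on success. */
static int fake_read_pmc(void *ctxt, uint32_t pmc, uint64_t *pdata)
{
        (void)ctxt;
        *pdata = 0x1234ull + pmc;       /* made-up counter value */
        return 0;
}

static const struct emu_ops ops = {
        .read_pmc = fake_read_pmc,
};

int main(void)
{
        uint64_t val;

        if (!ops.read_pmc(NULL, 0, &val))
                printf("pmc 0 reads %#llx\n", (unsigned long long)val);
        return 0;
}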
@@ -4836,6 +4362,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4836 return false; 4362 return false;
4837} 4363}
4838 4364
4365static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4366 unsigned long cr2, int emulation_type)
4367{
4368 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4369 unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
4370
4371 last_retry_eip = vcpu->arch.last_retry_eip;
4372 last_retry_addr = vcpu->arch.last_retry_addr;
4373
4374 /*
 4375	 * If the emulation is caused by #PF and it is a non-page-table-
 4376	 * writing instruction, the VM exit was caused by shadow-page
 4377	 * protection: we can zap the shadow page and retry the
 4378	 * instruction directly.
 4379	 *
 4380	 * Note: if the guest uses a non-page-table-modifying instruction
 4381	 * on the PDE that points to the instruction, then we will unmap
 4382	 * the instruction and go into an infinite loop. So we cache the
 4383	 * last retried eip and the last fault address; if we meet that eip
 4384	 * and address again, we can break out of the potential infinite
 4385	 * loop.
4386 */
4387 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
4388
4389 if (!(emulation_type & EMULTYPE_RETRY))
4390 return false;
4391
4392 if (x86_page_table_writing_insn(ctxt))
4393 return false;
4394
4395 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
4396 return false;
4397
4398 vcpu->arch.last_retry_eip = ctxt->eip;
4399 vcpu->arch.last_retry_addr = cr2;
4400
4401 if (!vcpu->arch.mmu.direct_map)
4402 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4403
4404 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4405
4406 return true;
4407}
4408
4839int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4409int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4840 unsigned long cr2, 4410 unsigned long cr2,
4841 int emulation_type, 4411 int emulation_type,
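retry_instruction() above guards its unprotect-and-retry optimization with a one-deep cache: if the same (eip, fault address) pair comes back, the retry is refused and ordinary emulation proceeds, breaking the infinite loop described in the comment. The loop breaker in miniature (hypothetical userspace sketch):

#include <stdbool.h>
#include <stdio.h>

static unsigned long last_eip, last_addr;

/* Return true to retry the faulting instruction, false to fall back to
 * full emulation. A single cached (eip, addr) pair is enough to detect
 * the self-referencing case described in the comment above. */
static bool should_retry(unsigned long eip, unsigned long addr)
{
        if (eip == last_eip && addr == last_addr)
                return false;           /* same fault twice: stop retrying */
        last_eip = eip;
        last_addr = addr;
        return true;
}

int main(void)
{
        printf("%d\n", should_retry(0x1000, 0x2000));   /* 1: first attempt */
        printf("%d\n", should_retry(0x1000, 0x2000));   /* 0: loop detected */
        return 0;
}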
@@ -4877,6 +4447,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4877 return EMULATE_DONE; 4447 return EMULATE_DONE;
4878 } 4448 }
4879 4449
4450 if (retry_instruction(ctxt, cr2, emulation_type))
4451 return EMULATE_DONE;
4452
4880 /* this is needed for vmware backdoor interface to work since it 4453 /* this is needed for vmware backdoor interface to work since it
4881 changes registers values during IO operation */ 4454 changes registers values during IO operation */
4882 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4455 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
@@ -5095,17 +4668,17 @@ static void kvm_timer_init(void)
5095 4668
5096static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4669static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5097 4670
5098static int kvm_is_in_guest(void) 4671int kvm_is_in_guest(void)
5099{ 4672{
5100 return percpu_read(current_vcpu) != NULL; 4673 return __this_cpu_read(current_vcpu) != NULL;
5101} 4674}
5102 4675
5103static int kvm_is_user_mode(void) 4676static int kvm_is_user_mode(void)
5104{ 4677{
5105 int user_mode = 3; 4678 int user_mode = 3;
5106 4679
5107 if (percpu_read(current_vcpu)) 4680 if (__this_cpu_read(current_vcpu))
5108 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); 4681 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5109 4682
5110 return user_mode != 0; 4683 return user_mode != 0;
5111} 4684}
@@ -5114,8 +4687,8 @@ static unsigned long kvm_get_guest_ip(void)
5114{ 4687{
5115 unsigned long ip = 0; 4688 unsigned long ip = 0;
5116 4689
5117 if (percpu_read(current_vcpu)) 4690 if (__this_cpu_read(current_vcpu))
5118 ip = kvm_rip_read(percpu_read(current_vcpu)); 4691 ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5119 4692
5120 return ip; 4693 return ip;
5121} 4694}
@@ -5128,13 +4701,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
5128 4701
5129void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 4702void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5130{ 4703{
5131 percpu_write(current_vcpu, vcpu); 4704 __this_cpu_write(current_vcpu, vcpu);
5132} 4705}
5133EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 4706EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5134 4707
5135void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 4708void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5136{ 4709{
5137 percpu_write(current_vcpu, NULL); 4710 __this_cpu_write(current_vcpu, NULL);
5138} 4711}
5139EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 4712EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5140 4713
@@ -5233,15 +4806,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5233} 4806}
5234EXPORT_SYMBOL_GPL(kvm_emulate_halt); 4807EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5235 4808
5236static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
5237 unsigned long a1)
5238{
5239 if (is_long_mode(vcpu))
5240 return a0;
5241 else
5242 return a0 | ((gpa_t)a1 << 32);
5243}
5244
5245int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 4809int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5246{ 4810{
5247 u64 param, ingpa, outgpa, ret; 4811 u64 param, ingpa, outgpa, ret;
@@ -5337,9 +4901,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5337 case KVM_HC_VAPIC_POLL_IRQ: 4901 case KVM_HC_VAPIC_POLL_IRQ:
5338 ret = 0; 4902 ret = 0;
5339 break; 4903 break;
5340 case KVM_HC_MMU_OP:
5341 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
5342 break;
5343 default: 4904 default:
5344 ret = -KVM_ENOSYS; 4905 ret = -KVM_ENOSYS;
5345 break; 4906 break;
@@ -5369,125 +4930,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5369 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 4930 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5370} 4931}
5371 4932
5372static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
5373{
5374 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
5375 int j, nent = vcpu->arch.cpuid_nent;
5376
5377 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
5378 /* when no next entry is found, the current entry[i] is reselected */
5379 for (j = i + 1; ; j = (j + 1) % nent) {
5380 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
5381 if (ej->function == e->function) {
5382 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
5383 return j;
5384 }
5385 }
5386 return 0; /* silence gcc, even though control never reaches here */
5387}
5388
5389/* find an entry with matching function, matching index (if needed), and that
5390 * should be read next (if it's stateful) */
5391static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
5392 u32 function, u32 index)
5393{
5394 if (e->function != function)
5395 return 0;
5396 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
5397 return 0;
5398 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
5399 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
5400 return 0;
5401 return 1;
5402}
5403
5404struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
5405 u32 function, u32 index)
5406{
5407 int i;
5408 struct kvm_cpuid_entry2 *best = NULL;
5409
5410 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
5411 struct kvm_cpuid_entry2 *e;
5412
5413 e = &vcpu->arch.cpuid_entries[i];
5414 if (is_matching_cpuid_entry(e, function, index)) {
5415 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
5416 move_to_next_stateful_cpuid_entry(vcpu, i);
5417 best = e;
5418 break;
5419 }
5420 }
5421 return best;
5422}
5423EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
5424
5425int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
5426{
5427 struct kvm_cpuid_entry2 *best;
5428
5429 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
5430 if (!best || best->eax < 0x80000008)
5431 goto not_found;
5432 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
5433 if (best)
5434 return best->eax & 0xff;
5435not_found:
5436 return 36;
5437}
5438
5439/*
5440 * If no match is found, check whether we exceed the vCPU's limit
5441 * and return the content of the highest valid _standard_ leaf instead.
5442 * This is to satisfy the CPUID specification.
5443 */
5444static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
5445 u32 function, u32 index)
5446{
5447 struct kvm_cpuid_entry2 *maxlevel;
5448
5449 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
5450 if (!maxlevel || maxlevel->eax >= function)
5451 return NULL;
5452 if (function & 0x80000000) {
5453 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
5454 if (!maxlevel)
5455 return NULL;
5456 }
5457 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
5458}
5459
5460void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
5461{
5462 u32 function, index;
5463 struct kvm_cpuid_entry2 *best;
5464
5465 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
5466 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5467 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
5468 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
5469 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
5470 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
5471 best = kvm_find_cpuid_entry(vcpu, function, index);
5472
5473 if (!best)
5474 best = check_cpuid_limit(vcpu, function, index);
5475
5476 if (best) {
5477 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
5478 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
5479 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
5480 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
5481 }
5482 kvm_x86_ops->skip_emulated_instruction(vcpu);
5483 trace_kvm_cpuid(function,
5484 kvm_register_read(vcpu, VCPU_REGS_RAX),
5485 kvm_register_read(vcpu, VCPU_REGS_RBX),
5486 kvm_register_read(vcpu, VCPU_REGS_RCX),
5487 kvm_register_read(vcpu, VCPU_REGS_RDX));
5488}
5489EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
5490
5491/* 4933/*
5492 * Check if userspace requested an interrupt window, and that the 4934 * Check if userspace requested an interrupt window, and that the
5493 * interrupt window is open. 4935 * interrupt window is open.
@@ -5648,6 +5090,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5648 int r; 5090 int r;
5649 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5091 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5650 vcpu->run->request_interrupt_window; 5092 vcpu->run->request_interrupt_window;
5093 bool req_immediate_exit = 0;
5651 5094
5652 if (vcpu->requests) { 5095 if (vcpu->requests) {
5653 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 5096 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5687,7 +5130,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5687 record_steal_time(vcpu); 5130 record_steal_time(vcpu);
5688 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 5131 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5689 process_nmi(vcpu); 5132 process_nmi(vcpu);
5690 5133 req_immediate_exit =
5134 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
5135 if (kvm_check_request(KVM_REQ_PMU, vcpu))
5136 kvm_handle_pmu_event(vcpu);
5137 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5138 kvm_deliver_pmi(vcpu);
5691 } 5139 }
5692 5140
5693 r = kvm_mmu_reload(vcpu); 5141 r = kvm_mmu_reload(vcpu);
@@ -5738,6 +5186,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5738 5186
5739 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5187 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5740 5188
5189 if (req_immediate_exit)
5190 smp_send_reschedule(vcpu->cpu);
5191
5741 kvm_guest_enter(); 5192 kvm_guest_enter();
5742 5193
5743 if (unlikely(vcpu->arch.switch_db_regs)) { 5194 if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -5943,10 +5394,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5943 if (r <= 0) 5394 if (r <= 0)
5944 goto out; 5395 goto out;
5945 5396
5946 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
5947 kvm_register_write(vcpu, VCPU_REGS_RAX,
5948 kvm_run->hypercall.ret);
5949
5950 r = __vcpu_run(vcpu); 5397 r = __vcpu_run(vcpu);
5951 5398
5952out: 5399out:
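The KVM_REQ_IMMEDIATE_EXIT handling above latches the request with kvm_check_request() (a test-and-clear of one request bit) and, if it was set, posts an IPI to the vcpu via smp_send_reschedule() right before guest entry so the VM exits immediately. A simplified, non-atomic sketch of the test-and-clear request pattern (illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define REQ_IMMEDIATE_EXIT      0

static unsigned long requests;

/* Test-and-clear one request bit; the real kvm_check_request() does
 * this atomically on vcpu->requests. */
static bool check_request(int bit)
{
        if (requests & (1UL << bit)) {
                requests &= ~(1UL << bit);
                return true;
        }
        return false;
}

int main(void)
{
        requests |= 1UL << REQ_IMMEDIATE_EXIT;  /* set elsewhere, e.g. by
                                                   nested-virt code */
        if (check_request(REQ_IMMEDIATE_EXIT))
                printf("would self-IPI via smp_send_reschedule()\n");
        return 0;
}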
@@ -6148,7 +5595,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6148 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5595 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6149 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5596 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6150 if (sregs->cr4 & X86_CR4_OSXSAVE) 5597 if (sregs->cr4 & X86_CR4_OSXSAVE)
6151 update_cpuid(vcpu); 5598 kvm_update_cpuid(vcpu);
6152 5599
6153 idx = srcu_read_lock(&vcpu->kvm->srcu); 5600 idx = srcu_read_lock(&vcpu->kvm->srcu);
6154 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5601 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
@@ -6425,6 +5872,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6425 kvm_async_pf_hash_reset(vcpu); 5872 kvm_async_pf_hash_reset(vcpu);
6426 vcpu->arch.apf.halted = false; 5873 vcpu->arch.apf.halted = false;
6427 5874
5875 kvm_pmu_reset(vcpu);
5876
6428 return kvm_x86_ops->vcpu_reset(vcpu); 5877 return kvm_x86_ops->vcpu_reset(vcpu);
6429} 5878}
6430 5879
@@ -6473,10 +5922,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6473 kvm = vcpu->kvm; 5922 kvm = vcpu->kvm;
6474 5923
6475 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 5924 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6476 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
6477 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
6478 vcpu->arch.mmu.translate_gpa = translate_gpa;
6479 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
6480 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5925 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6481 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5926 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6482 else 5927 else
@@ -6513,6 +5958,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6513 goto fail_free_mce_banks; 5958 goto fail_free_mce_banks;
6514 5959
6515 kvm_async_pf_hash_reset(vcpu); 5960 kvm_async_pf_hash_reset(vcpu);
5961 kvm_pmu_init(vcpu);
6516 5962
6517 return 0; 5963 return 0;
6518fail_free_mce_banks: 5964fail_free_mce_banks:
@@ -6531,6 +5977,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6531{ 5977{
6532 int idx; 5978 int idx;
6533 5979
5980 kvm_pmu_destroy(vcpu);
6534 kfree(vcpu->arch.mce_banks); 5981 kfree(vcpu->arch.mce_banks);
6535 kvm_free_lapic(vcpu); 5982 kvm_free_lapic(vcpu);
6536 idx = srcu_read_lock(&vcpu->kvm->srcu); 5983 idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d36fe237c665..cb80c293cdd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
33 return (nr == BP_VECTOR) || (nr == OF_VECTOR); 33 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
34} 34}
35 35
36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
37 u32 function, u32 index);
38
39static inline bool is_protmode(struct kvm_vcpu *vcpu) 36static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{ 37{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE); 38 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125 gva_t addr, void *val, unsigned int bytes, 122 gva_t addr, void *val, unsigned int bytes,
126 struct x86_exception *exception); 123 struct x86_exception *exception);
127 124
125extern u64 host_xcr0;
126
128#endif 127#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603ba866f..642d8805bc1b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
856} 856}
857 857
858/* 858/*
859 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so 859 * Interrupt descriptors are allocated as-needed, but low-numbered ones are
860 * rather than set them in lguest_init_IRQ we are called here every time an 860 * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
 861 lguest device needs an interrupt. 861 tells us the irq is already used: other errors (i.e. ENOMEM) we take
862 * 862 * seriously.
863 * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
864 * pass that up!
865 */ 863 */
866void lguest_setup_irq(unsigned int irq) 864int lguest_setup_irq(unsigned int irq)
867{ 865{
868 irq_alloc_desc_at(irq, 0); 866 int err;
867
868 /* Returns -ve error or vector number. */
869 err = irq_alloc_desc_at(irq, 0);
870 if (err < 0 && err != -EEXIST)
871 return err;
872
869 irq_set_chip_and_handler_name(irq, &lguest_irq_controller, 873 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
870 handle_level_irq, "level"); 874 handle_level_irq, "level");
875 return 0;
871} 876}
872 877
873/* 878/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a298914058f9..6cabf6570d64 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
3#include <linux/ioport.h> 3#include <linux/ioport.h>
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/memblock.h> 5#include <linux/memblock.h>
6#include <linux/bootmem.h> /* for max_low_pfn */
6 7
7#include <asm/cacheflush.h> 8#include <asm/cacheflush.h>
8#include <asm/e820.h> 9#include <asm/e820.h>
@@ -15,6 +16,7 @@
15#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
16#include <asm/tlb.h> 17#include <asm/tlb.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
19#include <asm/dma.h> /* for MAX_DMA_PFN */
18 20
19unsigned long __initdata pgt_buf_start; 21unsigned long __initdata pgt_buf_start;
20unsigned long __meminitdata pgt_buf_end; 22unsigned long __meminitdata pgt_buf_end;
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
392 free_init_pages("initrd memory", start, PAGE_ALIGN(end)); 394 free_init_pages("initrd memory", start, PAGE_ALIGN(end));
393} 395}
394#endif 396#endif
397
398void __init zone_sizes_init(void)
399{
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401
402 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
403
404#ifdef CONFIG_ZONE_DMA
405 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
406#endif
407#ifdef CONFIG_ZONE_DMA32
408 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
409#endif
410 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
411#ifdef CONFIG_HIGHMEM
412 max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
413#endif
414
415 free_area_init_nodes(max_zone_pfns);
416}
417
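The new shared zone_sizes_init() replaces the 32-bit and 64-bit private copies deleted further down: each max_zone_pfns[] slot holds the highest page-frame number its zone may reach, and free_area_init_nodes() clips those limits against the per-node memory map. A toy computation of such a table (page size and RAM size assumed, values invented):

#include <stdio.h>
#include <string.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, MAX_NR_ZONES };

int main(void)
{
        /* Each slot: one past the last PFN that zone may contain
         * (4 KiB pages assumed; RAM size invented). */
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        int i;

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]    = 16UL << (20 - 12); /* 16 MiB  */
        max_zone_pfns[ZONE_DMA32]  = 4UL << (30 - 12);  /* 4 GiB   */
        max_zone_pfns[ZONE_NORMAL] = 8UL << (30 - 12);  /* 8 GiB RAM */

        for (i = 0; i < MAX_NR_ZONES; i++)
                printf("zone %d: max pfn %lu\n", i, max_zone_pfns[i]);
        return 0;
}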
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0c1da394a634..8663f6c47ccb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -668,22 +668,6 @@ void __init initmem_init(void)
668} 668}
669#endif /* !CONFIG_NEED_MULTIPLE_NODES */ 669#endif /* !CONFIG_NEED_MULTIPLE_NODES */
670 670
671static void __init zone_sizes_init(void)
672{
673 unsigned long max_zone_pfns[MAX_NR_ZONES];
674 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
675#ifdef CONFIG_ZONE_DMA
676 max_zone_pfns[ZONE_DMA] =
677 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
678#endif
679 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
680#ifdef CONFIG_HIGHMEM
681 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
682#endif
683
684 free_area_init_nodes(max_zone_pfns);
685}
686
687void __init setup_bootmem_allocator(void) 671void __init setup_bootmem_allocator(void)
688{ 672{
689 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 673 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
@@ -754,6 +738,17 @@ void __init mem_init(void)
754#ifdef CONFIG_FLATMEM 738#ifdef CONFIG_FLATMEM
755 BUG_ON(!mem_map); 739 BUG_ON(!mem_map);
756#endif 740#endif
741 /*
742 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
 743	 * be done before free_all_bootmem(). Memblock uses free low memory for
 744	 * temporary data (see find_range_array()) and for this purpose can use
 745	 * pages that were already passed to the buddy allocator, hence marked as
 746	 * not accessible in the page tables when compiled with
 747	 * CONFIG_DEBUG_PAGEALLOC. Otherwise the order of initialization is not
748 * important here.
749 */
750 set_highmem_pages_init();
751
757 /* this will put all low memory onto the freelists */ 752 /* this will put all low memory onto the freelists */
758 totalram_pages += free_all_bootmem(); 753 totalram_pages += free_all_bootmem();
759 754
@@ -765,8 +760,6 @@ void __init mem_init(void)
765 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 760 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
766 reservedpages++; 761 reservedpages++;
767 762
768 set_highmem_pages_init();
769
770 codesize = (unsigned long) &_etext - (unsigned long) &_text; 763 codesize = (unsigned long) &_etext - (unsigned long) &_text;
771 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 764 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
772 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 765 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a8a56ce3a962..436a0309db33 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -614,15 +614,6 @@ void __init initmem_init(void)
614 614
615void __init paging_init(void) 615void __init paging_init(void)
616{ 616{
617 unsigned long max_zone_pfns[MAX_NR_ZONES];
618
619 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
620#ifdef CONFIG_ZONE_DMA
621 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
622#endif
623 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
624 max_zone_pfns[ZONE_NORMAL] = max_pfn;
625
626 sparse_memory_present_with_active_regions(MAX_NUMNODES); 617 sparse_memory_present_with_active_regions(MAX_NUMNODES);
627 sparse_init(); 618 sparse_init();
628 619
@@ -634,7 +625,7 @@ void __init paging_init(void)
634 */ 625 */
635 node_clear_state(0, N_NORMAL_MEMORY); 626 node_clear_state(0, N_NORMAL_MEMORY);
636 627
637 free_area_init_nodes(max_zone_pfns); 628 zone_sizes_init();
638} 629}
639 630
640/* 631/*
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
75 */ 75 */
76 if (current->flags & PF_RANDOMIZE) { 76 if (current->flags & PF_RANDOMIZE) {
77 if (mmap_is_ia32()) 77 if (mmap_is_ia32())
78 rnd = (long)get_random_int() % (1<<8); 78 rnd = get_random_int() % (1<<8);
79 else 79 else
80 rnd = (long)(get_random_int() % (1<<28)); 80 rnd = get_random_int() % (1<<28);
81 } 81 }
82 return rnd << PAGE_SHIFT; 82 return rnd << PAGE_SHIFT;
83} 83}
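The mmap_rnd() change above only drops redundant casts; the policy is unchanged: 8 bits of random slide for 32-bit tasks and 28 bits for 64-bit ones, shifted by PAGE_SHIFT so offsets stay page-aligned. Quick arithmetic check (assuming 4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed */

int main(void)
{
        unsigned long long max32 = ((1ULL << 8) - 1) << PAGE_SHIFT;
        unsigned long long max64 = ((1ULL << 28) - 1) << PAGE_SHIFT;

        /* 8 + 12 bits => up to ~1 MiB of slide; 28 + 12 => ~1 TiB */
        printf("ia32 max slide:   %#llx\n", max32);
        printf("64-bit max slide: %#llx\n", max64);
        return 0;
}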
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b278a7..dc0b727742f4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
75 75
76/* module parameters */ 76/* module parameters */
77static unsigned long filter_offset; 77static unsigned long filter_offset;
78static int nommiotrace; 78static bool nommiotrace;
79static int trace_pc; 79static bool trace_pc;
80 80
81module_param(filter_offset, ulong, 0); 81module_param(filter_offset, ulong, 0);
82module_param(nommiotrace, bool, 0); 82module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 496f494593bf..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
110 * Allocate node_to_cpumask_map based on number of available nodes 110 * Allocate node_to_cpumask_map based on number of available nodes
111 * Requires node_possible_map to be valid. 111 * Requires node_possible_map to be valid.
112 * 112 *
113 * Note: node_to_cpumask() is not valid until after this is done. 113 * Note: cpumask_of_node() is not valid until after this is done.
114 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) 114 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
115 */ 115 */
116void __init setup_node_to_cpumask_map(void) 116void __init setup_node_to_cpumask_map(void)
@@ -422,8 +422,9 @@ static int __init numa_alloc_distance(void)
422 * calls are ignored until the distance table is reset with 422 * calls are ignored until the distance table is reset with
423 * numa_reset_distance(). 423 * numa_reset_distance().
424 * 424 *
425 * If @from or @to is higher than the highest known node at the time of 425 * If @from or @to is higher than the highest known node or lower than zero
426 * table creation or @distance doesn't make sense, the call is ignored. 426 * at the time of table creation or @distance doesn't make sense, the call
427 * is ignored.
427 * This is to allow simplification of specific NUMA config implementations. 428 * This is to allow simplification of specific NUMA config implementations.
428 */ 429 */
429void __init numa_set_distance(int from, int to, int distance) 430void __init numa_set_distance(int from, int to, int distance)
@@ -431,8 +432,9 @@ void __init numa_set_distance(int from, int to, int distance)
431 if (!numa_distance && numa_alloc_distance() < 0) 432 if (!numa_distance && numa_alloc_distance() < 0)
432 return; 433 return;
433 434
434 if (from >= numa_distance_cnt || to >= numa_distance_cnt) { 435 if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
435 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", 436 from < 0 || to < 0) {
437 pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
436 from, to, distance); 438 from, to, distance);
437 return; 439 return;
438 } 440 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index eda2acbb6e81..e1ebde315210 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1334 } 1334 }
1335 1335
1336 /* 1336 /*
1337 * If page allocator is not up yet then do not call c_p_a():
1338 */
1339 if (!debug_pagealloc_enabled)
1340 return;
1341
1342 /*
1343 * The return value is ignored as the calls cannot fail. 1337 * The return value is ignored as the calls cannot fail.
1344 * Large pages for identity mappings are not used at boot time 1338 * Large pages for identity mappings are not used at boot time
1345 * and hence no memory allocations during large page split. 1339 * and hence no memory allocations during large page split.
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index fd61b3fb7341..1c1c4f46a7c1 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -109,6 +109,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
109 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) 109 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
110 return; 110 return;
111 pxm = pa->proximity_domain_lo; 111 pxm = pa->proximity_domain_lo;
112 if (acpi_srat_revision >= 2)
113 pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
112 node = setup_node(pxm); 114 node = setup_node(pxm);
113 if (node < 0) { 115 if (node < 0) {
114 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); 116 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -160,6 +162,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
160 start = ma->base_address; 162 start = ma->base_address;
161 end = start + ma->length; 163 end = start + ma->length;
162 pxm = ma->proximity_domain; 164 pxm = ma->proximity_domain;
165 if (acpi_srat_revision <= 1)
166 pxm &= 0xff;
163 node = setup_node(pxm); 167 node = setup_node(pxm);
164 if (node < 0) { 168 if (node < 0) {
165 printk(KERN_ERR "SRAT: Too many proximity domains.\n"); 169 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
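The SRAT hunks above account for ACPI SRAT revision 2, which widened proximity domains from 8 to 32 bits: CPU-affinity entries carry the upper three bytes in proximity_domain_hi[3], while revision <= 1 memory-affinity entries must be masked down to 8 bits. A byte-assembly sketch of the widened domain (illustrative; the hunk itself uses a 32-bit load of the hi[] array shifted left by 8 instead):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint8_t lo = 0x44;                      /* proximity_domain_lo   */
        uint8_t hi[3] = { 0x33, 0x22, 0x11 };   /* proximity_domain_hi[] */
        uint32_t pxm;

        /* Assemble the 32-bit domain byte by byte; this sidesteps the
         * unaligned 32-bit load implied by the cast in the hunk above. */
        pxm = lo | (uint32_t)hi[0] << 8 | (uint32_t)hi[1] << 16 |
              (uint32_t)hi[2] << 24;

        printf("pxm = %#x\n", pxm);             /* prints 0x11223344 */
        return 0;
}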
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..e76e18c94a3c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,11 +15,12 @@ obj-$(CONFIG_X86_VISWS) += visws.o
15 15
16obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 16obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
17 17
18obj-$(CONFIG_X86_MRST) += mrst.o 18obj-$(CONFIG_X86_INTEL_MID) += mrst.o
19 19
20obj-y += common.o early.o 20obj-y += common.o early.o
21obj-y += amd_bus.o bus_numa.o 21obj-y += bus_numa.o
22 22
23obj-$(CONFIG_AMD_NB) += amd_bus.o
23obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o 24obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
24 25
25ifeq ($(CONFIG_PCI_DEBUG),y) 26ifeq ($(CONFIG_PCI_DEBUG),y)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a3ff9e..a312e76063a7 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,7 @@ struct pci_root_info {
12 char *name; 12 char *name;
13 unsigned int res_num; 13 unsigned int res_num;
14 struct resource *res; 14 struct resource *res;
15 struct pci_bus *bus; 15 struct list_head *resources;
16 int busnum; 16 int busnum;
17}; 17};
18 18
@@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id)
24 return 0; 24 return 0;
25} 25}
26 26
27static int __init set_nouse_crs(const struct dmi_system_id *id)
28{
29 pci_use_crs = false;
30 return 0;
31}
32
27static const struct dmi_system_id pci_use_crs_table[] __initconst = { 33static const struct dmi_system_id pci_use_crs_table[] __initconst = {
28 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ 34 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
29 { 35 {
@@ -54,6 +60,29 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
54 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), 60 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
55 }, 61 },
56 }, 62 },
63
64 /* Now for the blacklist.. */
65
66 /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
67 {
68 .callback = set_nouse_crs,
69 .ident = "Dell Studio 1557",
70 .matches = {
71 DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
72 DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
73 DMI_MATCH(DMI_BIOS_VERSION, "A09"),
74 },
75 },
76 /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
77 {
78 .callback = set_nouse_crs,
79 .ident = "Thinkpad SL510",
80 .matches = {
81 DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
82 DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
83 DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
84 },
85 },
57 {} 86 {}
58}; 87};
59 88
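The blacklist entries above reuse the whitelist's machinery: a single DMI table with per-entry callbacks, where a machine match fires set_use_crs() or set_nouse_crs(). A stripped-down sketch of the match-and-callback table idea (entries and matching logic invented; the real DMI matcher compares several fields per entry):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool pci_use_crs = true;

struct crs_quirk {
        int (*callback)(void);
        const char *ident;
        const char *board;      /* one match key; DMI uses several */
};

static int set_use_crs(void)   { pci_use_crs = true;  return 0; }
static int set_nouse_crs(void) { pci_use_crs = false; return 0; }

static const struct crs_quirk quirks[] = {
        { set_use_crs,   "some whitelisted box", "GoodBoard" },
        { set_nouse_crs, "Dell Studio 1557",     "Studio 1557" },
        { NULL, NULL, NULL }
};

int main(void)
{
        const char *board = "Studio 1557";      /* pretend DMI string */
        const struct crs_quirk *q;

        for (q = quirks; q->callback; q++)
                if (!strcmp(q->board, board))
                        q->callback();

        printf("pci_use_crs = %d\n", pci_use_crs);
        return 0;
}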
@@ -149,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
149 struct acpi_resource_address64 addr; 178 struct acpi_resource_address64 addr;
150 acpi_status status; 179 acpi_status status;
151 unsigned long flags; 180 unsigned long flags;
152 u64 start, end; 181 u64 start, orig_end, end;
153 182
154 status = resource_to_addr(acpi_res, &addr); 183 status = resource_to_addr(acpi_res, &addr);
155 if (!ACPI_SUCCESS(status)) 184 if (!ACPI_SUCCESS(status))
@@ -165,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
165 return AE_OK; 194 return AE_OK;
166 195
167 start = addr.minimum + addr.translation_offset; 196 start = addr.minimum + addr.translation_offset;
168 end = addr.maximum + addr.translation_offset; 197 orig_end = end = addr.maximum + addr.translation_offset;
198
199 /* Exclude non-addressable range or non-addressable portion of range */
200 end = min(end, (u64)iomem_resource.end);
201 if (end <= start) {
202 dev_info(&info->bridge->dev,
203 "host bridge window [%#llx-%#llx] "
204 "(ignored, not CPU addressable)\n", start, orig_end);
205 return AE_OK;
206 } else if (orig_end != end) {
207 dev_info(&info->bridge->dev,
208 "host bridge window [%#llx-%#llx] "
209 "([%#llx-%#llx] ignored, not CPU addressable)\n",
210 start, orig_end, end + 1, orig_end);
211 }
169 212
170 res = &info->res[info->res_num]; 213 res = &info->res[info->res_num];
171 res->name = info->name; 214 res->name = info->name;
@@ -261,23 +304,20 @@ static void add_resources(struct pci_root_info *info)
261 "ignoring host bridge window %pR (conflicts with %s %pR)\n", 304 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
262 res, conflict->name, conflict); 305 res, conflict->name, conflict);
263 else 306 else
264 pci_bus_add_resource(info->bus, res, 0); 307 pci_add_resource(info->resources, res);
265 } 308 }
266} 309}
267 310
268static void 311static void
269get_current_resources(struct acpi_device *device, int busnum, 312get_current_resources(struct acpi_device *device, int busnum,
270 int domain, struct pci_bus *bus) 313 int domain, struct list_head *resources)
271{ 314{
272 struct pci_root_info info; 315 struct pci_root_info info;
273 size_t size; 316 size_t size;
274 317
275 if (pci_use_crs)
276 pci_bus_remove_resources(bus);
277
278 info.bridge = device; 318 info.bridge = device;
279 info.bus = bus;
280 info.res_num = 0; 319 info.res_num = 0;
320 info.resources = resources;
281 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 321 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
282 &info); 322 &info);
283 if (!info.res_num) 323 if (!info.res_num)
@@ -286,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum,
286 size = sizeof(*info.res) * info.res_num; 326 size = sizeof(*info.res) * info.res_num;
287 info.res = kmalloc(size, GFP_KERNEL); 327 info.res = kmalloc(size, GFP_KERNEL);
288 if (!info.res) 328 if (!info.res)
289 goto res_alloc_fail; 329 return;
290 330
291 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); 331 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
292 if (!info.name) 332 if (!info.name)
@@ -301,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum,
301 341
302name_alloc_fail: 342name_alloc_fail:
303 kfree(info.res); 343 kfree(info.res);
304res_alloc_fail:
305 return;
306} 344}
307 345
308struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) 346struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -310,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
310 struct acpi_device *device = root->device; 348 struct acpi_device *device = root->device;
311 int domain = root->segment; 349 int domain = root->segment;
312 int busnum = root->secondary.start; 350 int busnum = root->secondary.start;
351 LIST_HEAD(resources);
313 struct pci_bus *bus; 352 struct pci_bus *bus;
314 struct pci_sysdata *sd; 353 struct pci_sysdata *sd;
315 int node; 354 int node;
@@ -364,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
364 memcpy(bus->sysdata, sd, sizeof(*sd)); 403 memcpy(bus->sysdata, sd, sizeof(*sd));
365 kfree(sd); 404 kfree(sd);
366 } else { 405 } else {
367 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 406 get_current_resources(device, busnum, domain, &resources);
368 if (bus) { 407 if (list_empty(&resources))
369 get_current_resources(device, busnum, domain, bus); 408 x86_pci_root_bus_resources(busnum, &resources);
409 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
410 &resources);
411 if (bus)
370 bus->subordinate = pci_scan_child_bus(bus); 412 bus->subordinate = pci_scan_child_bus(bus);
371 } 413 else
414 pci_free_resource_list(&resources);
372 } 415 }
373 416
374 /* After the PCI-E bus has been walked and all devices discovered, 417 /* After the PCI-E bus has been walked and all devices discovered,
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d162..0567df3890e1 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
31}; 31};
32 32
33static u64 __initdata fam10h_mmconf_start;
34static u64 __initdata fam10h_mmconf_end;
35static void __init get_pci_mmcfg_amd_fam10h_range(void)
36{
37 u32 address;
38 u64 base, msr;
39 unsigned segn_busn_bits;
40
41 /* assume all cpus from fam10h have mmconf */
42 if (boot_cpu_data.x86 < 0x10)
43 return;
44
45 address = MSR_FAM10H_MMIO_CONF_BASE;
46 rdmsrl(address, msr);
47
48 /* mmconfig is not enable */
49 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
50 return;
51
52 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
53
54 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
55 FAM10H_MMIO_CONF_BUSRANGE_MASK;
56
57 fam10h_mmconf_start = base;
58 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
59}
60
61#define RANGE_NUM 16 33#define RANGE_NUM 16
62 34
63/** 35/**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
85 u64 val; 57 u64 val;
86 u32 address; 58 u32 address;
87 bool found; 59 bool found;
60 struct resource fam10h_mmconf_res, *fam10h_mmconf;
61 u64 fam10h_mmconf_start;
62 u64 fam10h_mmconf_end;
88 63
89 if (!early_pci_allowed()) 64 if (!early_pci_allowed())
90 return -1; 65 return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
211 subtract_range(range, RANGE_NUM, 0, end); 186 subtract_range(range, RANGE_NUM, 0, end);
212 187
213 /* get mmconfig */ 188 /* get mmconfig */
214 get_pci_mmcfg_amd_fam10h_range(); 189 fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
215 /* need to take out mmconf range */ 190 /* need to take out mmconf range */
216 if (fam10h_mmconf_end) { 191 if (fam10h_mmconf) {
217 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 192 printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
193 fam10h_mmconf_start = fam10h_mmconf->start;
194 fam10h_mmconf_end = fam10h_mmconf->end;
218 subtract_range(range, RANGE_NUM, fam10h_mmconf_start, 195 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
219 fam10h_mmconf_end + 1); 196 fam10h_mmconf_end + 1);
197 } else {
198 fam10h_mmconf_start = 0;
199 fam10h_mmconf_end = 0;
220 } 200 }
221 201
222 /* mmio resource */ 202 /* mmio resource */
@@ -403,7 +383,6 @@ static void __init pci_enable_pci_io_ecs(void)
403 ++n; 383 ++n;
404 } 384 }
405 } 385 }
406 pr_info("Extended Config Space enabled on %u nodes\n", n);
407#endif 386#endif
408} 387}
409 388
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index ab8269b0da29..f3a7c569a403 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -15,10 +15,11 @@
15#include <linux/pci.h> 15#include <linux/pci.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <asm/pci_x86.h> 17#include <asm/pci_x86.h>
18#include <asm/pci-direct.h>
18 19
19#include "bus_numa.h" 20#include "bus_numa.h"
20 21
21static void __devinit cnb20le_res(struct pci_dev *dev) 22static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
22{ 23{
23 struct pci_root_info *info; 24 struct pci_root_info *info;
24 struct resource res; 25 struct resource res;
@@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
26 u8 fbus, lbus; 27 u8 fbus, lbus;
27 int i; 28 int i;
28 29
29#ifdef CONFIG_ACPI
30 /*
31 * We should get host bridge information from ACPI unless the BIOS
32 * doesn't support it.
33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
37
38 info = &pci_root_info[pci_root_num]; 30 info = &pci_root_info[pci_root_num];
39 pci_root_num++; 31 pci_root_num++;
40 32
41 /* read the PCI bus numbers */ 33 /* read the PCI bus numbers */
42 pci_read_config_byte(dev, 0x44, &fbus); 34 fbus = read_pci_config_byte(bus, slot, func, 0x44);
43 pci_read_config_byte(dev, 0x45, &lbus); 35 lbus = read_pci_config_byte(bus, slot, func, 0x45);
44 info->bus_min = fbus; 36 info->bus_min = fbus;
45 info->bus_max = lbus; 37 info->bus_max = lbus;
46 38
@@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
59 } 51 }
60 52
61 /* read the non-prefetchable memory window */ 53 /* read the non-prefetchable memory window */
62 pci_read_config_word(dev, 0xc0, &word1); 54 word1 = read_pci_config_16(bus, slot, func, 0xc0);
63 pci_read_config_word(dev, 0xc2, &word2); 55 word2 = read_pci_config_16(bus, slot, func, 0xc2);
64 if (word1 != word2) { 56 if (word1 != word2) {
65 res.start = (word1 << 16) | 0x0000; 57 res.start = (word1 << 16) | 0x0000;
66 res.end = (word2 << 16) | 0xffff; 58 res.end = (word2 << 16) | 0xffff;
@@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
69 } 61 }
70 62
71 /* read the prefetchable memory window */ 63 /* read the prefetchable memory window */
72 pci_read_config_word(dev, 0xc4, &word1); 64 word1 = read_pci_config_16(bus, slot, func, 0xc4);
73 pci_read_config_word(dev, 0xc6, &word2); 65 word2 = read_pci_config_16(bus, slot, func, 0xc6);
74 if (word1 != word2) { 66 if (word1 != word2) {
75 res.start = (word1 << 16) | 0x0000; 67 res.start = (word1 << 16) | 0x0000;
76 res.end = (word2 << 16) | 0xffff; 68 res.end = (word2 << 16) | 0xffff;
@@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
79 } 71 }
80 72
81 /* read the IO port window */ 73 /* read the IO port window */
82 pci_read_config_word(dev, 0xd0, &word1); 74 word1 = read_pci_config_16(bus, slot, func, 0xd0);
83 pci_read_config_word(dev, 0xd2, &word2); 75 word2 = read_pci_config_16(bus, slot, func, 0xd2);
84 if (word1 != word2) { 76 if (word1 != word2) {
85 res.start = word1; 77 res.start = word1;
86 res.end = word2; 78 res.end = word2;
@@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
92 res.start = fbus; 84 res.start = fbus;
93 res.end = lbus; 85 res.end = lbus;
94 res.flags = IORESOURCE_BUS; 86 res.flags = IORESOURCE_BUS;
95 dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n", 87 printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
96 pci_domain_nr(dev->bus), &res);
97 88
98 for (i = 0; i < info->res_num; i++) 89 for (i = 0; i < info->res_num; i++)
99 dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]); 90 printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
100} 91}
101 92
102DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, 93static int __init broadcom_postcore_init(void)
103 cnb20le_res); 94{
95 u8 bus = 0, slot = 0;
96 u32 id;
97 u16 vendor, device;
98
99#ifdef CONFIG_ACPI
100 /*
101 * We should get host bridge information from ACPI unless the BIOS
102 * doesn't support it.
103 */
104 if (acpi_os_get_root_pointer())
105 return 0;
106#endif
107
108 id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
109 vendor = id & 0xffff;
110 device = (id >> 16) & 0xffff;
111
112 if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
113 device == PCI_DEVICE_ID_SERVERWORKS_LE) {
114 cnb20le_res(bus, slot, 0);
115 cnb20le_res(bus, slot, 1);
116 }
117 return 0;
118}
104 119
120postcore_initcall(broadcom_postcore_init);
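Rewritten as a postcore initcall, the probe above no longer depends on the PCI core having enumerated the bridge: it reads the vendor/device ID dword of bus 0, slot 0 through early config access and, on a ServerWorks CNB20LE match, parses both bridge functions. Splitting that ID dword (value invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Config-space dword at offset 0: device ID in the high half,
         * vendor ID in the low half (value invented). */
        uint32_t id = 0x00171166;
        uint16_t vendor = id & 0xffff;
        uint16_t device = (id >> 16) & 0xffff;

        printf("vendor %#06x, device %#06x\n", vendor, device);
        return 0;
}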
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 64a122883896..fd3f65510e9d 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -7,45 +7,50 @@
7int pci_root_num; 7int pci_root_num;
8struct pci_root_info pci_root_info[PCI_ROOT_NR]; 8struct pci_root_info pci_root_info[PCI_ROOT_NR];
9 9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b) 10void x86_pci_root_bus_resources(int bus, struct list_head *resources)
11{ 11{
12 int i; 12 int i;
13 int j; 13 int j;
14 struct pci_root_info *info; 14 struct pci_root_info *info;
15 15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num) 16 if (!pci_root_num)
22 return; 17 goto default_resources;
23 18
24 for (i = 0; i < pci_root_num; i++) { 19 for (i = 0; i < pci_root_num; i++) {
25 if (pci_root_info[i].bus_min == b->number) 20 if (pci_root_info[i].bus_min == bus)
26 break; 21 break;
27 } 22 }
28 23
29 if (i == pci_root_num) 24 if (i == pci_root_num)
30 return; 25 goto default_resources;
31 26
32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", 27 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
33 b->number); 28 bus);
34 29
35 pci_bus_remove_resources(b);
36 info = &pci_root_info[i]; 30 info = &pci_root_info[i];
37 for (j = 0; j < info->res_num; j++) { 31 for (j = 0; j < info->res_num; j++) {
38 struct resource *res; 32 struct resource *res;
39 struct resource *root; 33 struct resource *root;
40 34
41 res = &info->res[j]; 35 res = &info->res[j];
42 pci_bus_add_resource(b, res, 0); 36 pci_add_resource(resources, res);
43 if (res->flags & IORESOURCE_IO) 37 if (res->flags & IORESOURCE_IO)
44 root = &ioport_resource; 38 root = &ioport_resource;
45 else 39 else
46 root = &iomem_resource; 40 root = &iomem_resource;
47 insert_resource(root, res); 41 insert_resource(root, res);
48 } 42 }
43 return;
44
45default_resources:
46 /*
47 * We don't have any host bridge aperture information from the
48 * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
49 * so fall back to the defaults historically used by pci_create_bus().
50 */
51 printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
52 pci_add_resource(resources, &ioport_resource);
53 pci_add_resource(resources, &iomem_resource);
49} 54}
50 55
51void __devinit update_res(struct pci_root_info *info, resource_size_t start, 56void __devinit update_res(struct pci_root_info *info, resource_size_t start,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7962ccb4d9b2..323481e06ef8 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
164{ 164{
165 struct pci_dev *dev; 165 struct pci_dev *dev;
166 166
167 /* root bus? */
168 if (!b->parent)
169 x86_pci_root_bus_res_quirks(b);
170 pci_read_bridge_bases(b); 167 pci_read_bridge_bases(b);
171 list_for_each_entry(dev, &b->devices, bus_list) 168 list_for_each_entry(dev, &b->devices, bus_list)
172 pcibios_fixup_device_resources(dev); 169 pcibios_fixup_device_resources(dev);
@@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void)
433 430
434struct pci_bus * __devinit pcibios_scan_root(int busnum) 431struct pci_bus * __devinit pcibios_scan_root(int busnum)
435{ 432{
433 LIST_HEAD(resources);
436 struct pci_bus *bus = NULL; 434 struct pci_bus *bus = NULL;
437 struct pci_sysdata *sd; 435 struct pci_sysdata *sd;
438 436
@@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
456 sd->node = get_mp_bus_to_node(busnum); 454 sd->node = get_mp_bus_to_node(busnum);
457 455
458 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); 456 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
459 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 457 x86_pci_root_bus_resources(busnum, &resources);
460 if (!bus) 458 bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
459 if (!bus) {
460 pci_free_resource_list(&resources);
461 kfree(sd); 461 kfree(sd);
462 }
462 463
463 return bus; 464 return bus;
464} 465}
@@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev)
639 640
640struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) 641struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
641{ 642{
643 LIST_HEAD(resources);
642 struct pci_bus *bus = NULL; 644 struct pci_bus *bus = NULL;
643 struct pci_sysdata *sd; 645 struct pci_sysdata *sd;
644 646
@@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
653 return NULL; 655 return NULL;
654 } 656 }
655 sd->node = node; 657 sd->node = node;
656 bus = pci_scan_bus(busno, ops, sd); 658 x86_pci_root_bus_resources(busno, &resources);
657 if (!bus) 659 bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
660 if (!bus) {
661 pci_free_resource_list(&resources);
658 kfree(sd); 662 kfree(sd);
663 }
659 664
660 return bus; 665 return bus;
661} 666}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 794b092d01ae..91821a1a0c3a 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,26 +254,6 @@ void __init pcibios_resource_survey(void)
254 */ 254 */
255fs_initcall(pcibios_assign_resources); 255fs_initcall(pcibios_assign_resources);
256 256
257/*
258 * If we set up a device for bus mastering, we need to check the latency
259 * timer as certain crappy BIOSes forget to set it properly.
260 */
261unsigned int pcibios_max_latency = 255;
262
263void pcibios_set_master(struct pci_dev *dev)
264{
265 u8 lat;
266 pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
267 if (lat < 16)
268 lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
269 else if (lat > pcibios_max_latency)
270 lat = pcibios_max_latency;
271 else
272 return;
273 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
274 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
275}
276
277static const struct vm_operations_struct pci_mmap_ops = { 257static const struct vm_operations_struct pci_mmap_ops = {
278 .access = generic_access_phys, 258 .access = generic_access_phys,
279}; 259};
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 2c2aeabc2609..a1df191129d3 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -31,9 +31,6 @@ int __init pci_legacy_init(void)
31 31
32 printk("PCI: Probing PCI hardware\n"); 32 printk("PCI: Probing PCI hardware\n");
33 pci_root_bus = pcibios_scan_root(0); 33 pci_root_bus = pcibios_scan_root(0);
34 if (pci_root_bus)
35 pci_bus_add_devices(pci_root_bus);
36
37 return 0; 34 return 0;
38} 35}
39 36
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 51abf02f9226..83e125b95ca6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -153,8 +153,6 @@ int __init pci_numaq_init(void)
153 raw_pci_ops = &pci_direct_conf1_mq; 153 raw_pci_ops = &pci_direct_conf1_mq;
154 154
155 pci_root_bus = pcibios_scan_root(0); 155 pci_root_bus = pcibios_scan_root(0);
156 if (pci_root_bus)
157 pci_bus_add_devices(pci_root_bus);
158 if (num_online_nodes() > 1) 156 if (num_online_nodes() > 1)
159 for_each_online_node(quad) { 157 for_each_online_node(quad) {
160 if (quad == 0) 158 if (quad == 0)
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index db0e9a51e611..da8fe0535ff4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -44,7 +44,7 @@ static inline void set_bios_x(void)
44 pcibios_enabled = 1; 44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); 45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX) 46 if (__supported_pte_mask & _PAGE_NX)
47 printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); 47 printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48} 48}
49 49
50/* 50/*
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca1973699d3d..dc5f1d32aced 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
27 27
28#include <asm/geode.h> 28#include <asm/geode.h>
29 29
30static int force = 0; 30static bool force = 0;
31module_param(force, bool, 0444); 31module_param(force, bool, 0444);
32/* FIXME: Award bios is not automatically detected as Alix platform */ 32/* FIXME: Award bios is not automatically detected as Alix platform */
33MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); 33MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5ed8c9b..5917eb56b313 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); 42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); 43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
44 44
45static int force; 45static bool force;
46 46
47module_param(force, bool, 0); 47module_param(force, bool, 0);
48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); 48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea38775a6d3..7baed5135e0f 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_X86_MRST) += mrst.o 1obj-$(CONFIG_X86_INTEL_MID) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o 2obj-$(CONFIG_X86_INTEL_MID) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o 3obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o
4obj-$(CONFIG_X86_MRST) += pmu.o 4obj-$(CONFIG_X86_MRST) += pmu.o
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 25bfdbb5b130..3c6e328483c7 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -245,16 +245,24 @@ struct console early_mrst_console = {
245 * Following is the early console based on Medfield HSU (High 245 * Following is the early console based on Medfield HSU (High
246 * Speed UART) device. 246 * Speed UART) device.
247 */ 247 */
248#define HSU_PORT2_PADDR 0xffa28180 248#define HSU_PORT_BASE 0xffa28080
249 249
250static void __iomem *phsu; 250static void __iomem *phsu;
251 251
252void hsu_early_console_init(void) 252void hsu_early_console_init(const char *s)
253{ 253{
254 unsigned long paddr, port = 0;
254 u8 lcr; 255 u8 lcr;
255 256
256 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, 257 /*
257 HSU_PORT2_PADDR); 258 * Select the early HSU console port if specified by user in the
259 * kernel command line.
260 */
261 if (*s && !kstrtoul(s, 10, &port))
262 port = clamp_val(port, 0, 2);
263
264 paddr = HSU_PORT_BASE + port * 0x80;
265 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
258 266
259 /* Disable FIFO */ 267 /* Disable FIFO */
260 writeb(0x0, phsu + UART_FCR); 268 writeb(0x0, phsu + UART_FCR);
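hsu_early_console_init() now receives the option string from the command line, parses an optional port number, clamps it to 0..2, and maps the chosen UART at HSU_PORT_BASE + port * 0x80. The selection logic in isolation (userspace sketch; kstrtoul/clamp_val replaced with libc equivalents):

#include <stdio.h>
#include <stdlib.h>

#define HSU_PORT_BASE 0xffa28080UL

int main(void)
{
        const char *s = "2";            /* option string after "hsu" */
        unsigned long port = 0;

        if (*s)
                port = strtoul(s, NULL, 10);
        if (port > 2)                   /* clamp_val(port, 0, 2) */
                port = 2;

        /* port 2 lands on 0xffa28180, the old hard-wired HSU_PORT2_PADDR */
        printf("early HSU base: %#lx\n", HSU_PORT_BASE + port * 0x80);
        return 0;
}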
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index ad4ec1cb097e..475e2cd0f3c3 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -848,8 +848,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
848 if (mrst_has_msic()) 848 if (mrst_has_msic())
849 return; 849 return;
850 850
851 /* ID as IRQ is a hack that will go away */ 851 pdev = platform_device_alloc(entry->name, 0);
852 pdev = platform_device_alloc(entry->name, entry->irq);
853 if (pdev == NULL) { 852 if (pdev == NULL) {
854 pr_err("out of memory for SFI platform device '%s'.\n", 853 pr_err("out of memory for SFI platform device '%s'.\n",
855 entry->name); 854 entry->name);
@@ -1030,6 +1029,7 @@ static int __init pb_keys_init(void)
1030 num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); 1029 num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
1031 for (i = 0; i < num; i++) { 1030 for (i = 0; i < num; i++) {
1032 gb[i].gpio = get_gpio_by_name(gb[i].desc); 1031 gb[i].gpio = get_gpio_by_name(gb[i].desc);
1032 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
1033 if (gb[i].gpio == -1) 1033 if (gb[i].gpio == -1)
1034 continue; 1034 continue;
1035 1035
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..5d4ba301e776 100644
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -19,7 +19,7 @@
  * Copyright (c) Russ Anderson
  */
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
 
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
new file mode 100644
index 000000000000..564b2476fede
--- /dev/null
+++ b/arch/x86/syscalls/Makefile
@@ -0,0 +1,43 @@
+out := $(obj)/../include/generated/asm
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+syscall32 := $(srctree)/$(src)/syscall_32.tbl
+syscall64 := $(srctree)/$(src)/syscall_64.tbl
+
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
+		   $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
+quiet_cmd_systbl = SYSTBL  $@
+      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+
+syshdr_abi_unistd_32 := i386
+$(out)/unistd_32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_32_ia32 := i386
+syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_64 := 64
+$(out)/unistd_64.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: $(syscall32) $(systbl)
+	$(call if_changed,systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl)
+	$(call if_changed,systbl)
+
+syshdr-y			+= unistd_32.h unistd_64.h
+syshdr-y			+= syscalls_32.h
+syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
+syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+
+targets	+= $(syshdr-y)
+
+all: $(addprefix $(out)/,$(targets))
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..ce98e287c066
--- /dev/null
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,357 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The abi is always "i386" for this file.
+#
+0	i386	restart_syscall	sys_restart_syscall
+1	i386	exit	sys_exit
+2	i386	fork	ptregs_fork	stub32_fork
+3	i386	read	sys_read
+4	i386	write	sys_write
+5	i386	open	sys_open	compat_sys_open
+6	i386	close	sys_close
+7	i386	waitpid	sys_waitpid	sys32_waitpid
+8	i386	creat	sys_creat
+9	i386	link	sys_link
+10	i386	unlink	sys_unlink
+11	i386	execve	ptregs_execve	stub32_execve
+12	i386	chdir	sys_chdir
+13	i386	time	sys_time	compat_sys_time
+14	i386	mknod	sys_mknod
+15	i386	chmod	sys_chmod
+16	i386	lchown	sys_lchown16
+17	i386	break
+18	i386	oldstat	sys_stat
+19	i386	lseek	sys_lseek	sys32_lseek
+20	i386	getpid	sys_getpid
+21	i386	mount	sys_mount	compat_sys_mount
+22	i386	umount	sys_oldumount
+23	i386	setuid	sys_setuid16
+24	i386	getuid	sys_getuid16
+25	i386	stime	sys_stime	compat_sys_stime
+26	i386	ptrace	sys_ptrace	compat_sys_ptrace
+27	i386	alarm	sys_alarm
+28	i386	oldfstat	sys_fstat
+29	i386	pause	sys_pause
+30	i386	utime	sys_utime	compat_sys_utime
+31	i386	stty
+32	i386	gtty
+33	i386	access	sys_access
+34	i386	nice	sys_nice
+35	i386	ftime
+36	i386	sync	sys_sync
+37	i386	kill	sys_kill	sys32_kill
+38	i386	rename	sys_rename
+39	i386	mkdir	sys_mkdir
+40	i386	rmdir	sys_rmdir
+41	i386	dup	sys_dup
+42	i386	pipe	sys_pipe
+43	i386	times	sys_times	compat_sys_times
+44	i386	prof
+45	i386	brk	sys_brk
+46	i386	setgid	sys_setgid16
+47	i386	getgid	sys_getgid16
+48	i386	signal	sys_signal
+49	i386	geteuid	sys_geteuid16
+50	i386	getegid	sys_getegid16
+51	i386	acct	sys_acct
+52	i386	umount2	sys_umount
+53	i386	lock
+54	i386	ioctl	sys_ioctl	compat_sys_ioctl
+55	i386	fcntl	sys_fcntl	compat_sys_fcntl64
+56	i386	mpx
+57	i386	setpgid	sys_setpgid
+58	i386	ulimit
+59	i386	oldolduname	sys_olduname
+60	i386	umask	sys_umask
+61	i386	chroot	sys_chroot
+62	i386	ustat	sys_ustat	compat_sys_ustat
+63	i386	dup2	sys_dup2
+64	i386	getppid	sys_getppid
+65	i386	getpgrp	sys_getpgrp
+66	i386	setsid	sys_setsid
+67	i386	sigaction	sys_sigaction	sys32_sigaction
+68	i386	sgetmask	sys_sgetmask
+69	i386	ssetmask	sys_ssetmask
+70	i386	setreuid	sys_setreuid16
+71	i386	setregid	sys_setregid16
+72	i386	sigsuspend	sys_sigsuspend	sys32_sigsuspend
+73	i386	sigpending	sys_sigpending	compat_sys_sigpending
+74	i386	sethostname	sys_sethostname
+75	i386	setrlimit	sys_setrlimit	compat_sys_setrlimit
+76	i386	getrlimit	sys_old_getrlimit	compat_sys_old_getrlimit
+77	i386	getrusage	sys_getrusage	compat_sys_getrusage
+78	i386	gettimeofday	sys_gettimeofday	compat_sys_gettimeofday
+79	i386	settimeofday	sys_settimeofday	compat_sys_settimeofday
+80	i386	getgroups	sys_getgroups16
+81	i386	setgroups	sys_setgroups16
+82	i386	select	sys_old_select	compat_sys_old_select
+83	i386	symlink	sys_symlink
+84	i386	oldlstat	sys_lstat
+85	i386	readlink	sys_readlink
+86	i386	uselib	sys_uselib
+87	i386	swapon	sys_swapon
+88	i386	reboot	sys_reboot
+89	i386	readdir	sys_old_readdir	compat_sys_old_readdir
+90	i386	mmap	sys_old_mmap	sys32_mmap
+91	i386	munmap	sys_munmap
+92	i386	truncate	sys_truncate
+93	i386	ftruncate	sys_ftruncate
+94	i386	fchmod	sys_fchmod
+95	i386	fchown	sys_fchown16
+96	i386	getpriority	sys_getpriority
+97	i386	setpriority	sys_setpriority
+98	i386	profil
+99	i386	statfs	sys_statfs	compat_sys_statfs
+100	i386	fstatfs	sys_fstatfs	compat_sys_fstatfs
+101	i386	ioperm	sys_ioperm
+102	i386	socketcall	sys_socketcall	compat_sys_socketcall
+103	i386	syslog	sys_syslog
+104	i386	setitimer	sys_setitimer	compat_sys_setitimer
+105	i386	getitimer	sys_getitimer	compat_sys_getitimer
+106	i386	stat	sys_newstat	compat_sys_newstat
+107	i386	lstat	sys_newlstat	compat_sys_newlstat
+108	i386	fstat	sys_newfstat	compat_sys_newfstat
+109	i386	olduname	sys_uname
+110	i386	iopl	ptregs_iopl	stub32_iopl
+111	i386	vhangup	sys_vhangup
+112	i386	idle
+113	i386	vm86old	ptregs_vm86old	sys32_vm86_warning
+114	i386	wait4	sys_wait4	compat_sys_wait4
+115	i386	swapoff	sys_swapoff
+116	i386	sysinfo	sys_sysinfo	compat_sys_sysinfo
+117	i386	ipc	sys_ipc	sys32_ipc
+118	i386	fsync	sys_fsync
+119	i386	sigreturn	ptregs_sigreturn	stub32_sigreturn
+120	i386	clone	ptregs_clone	stub32_clone
+121	i386	setdomainname	sys_setdomainname
+122	i386	uname	sys_newuname
+123	i386	modify_ldt	sys_modify_ldt
+124	i386	adjtimex	sys_adjtimex	compat_sys_adjtimex
+125	i386	mprotect	sys_mprotect	sys32_mprotect
+126	i386	sigprocmask	sys_sigprocmask	compat_sys_sigprocmask
+127	i386	create_module
+128	i386	init_module	sys_init_module
+129	i386	delete_module	sys_delete_module
+130	i386	get_kernel_syms
+131	i386	quotactl	sys_quotactl	sys32_quotactl
+132	i386	getpgid	sys_getpgid
+133	i386	fchdir	sys_fchdir
+134	i386	bdflush	sys_bdflush
+135	i386	sysfs	sys_sysfs
+136	i386	personality	sys_personality
+137	i386	afs_syscall
+138	i386	setfsuid	sys_setfsuid16
+139	i386	setfsgid	sys_setfsgid16
+140	i386	_llseek	sys_llseek
+141	i386	getdents	sys_getdents	compat_sys_getdents
+142	i386	_newselect	sys_select	compat_sys_select
+143	i386	flock	sys_flock
+144	i386	msync	sys_msync
+145	i386	readv	sys_readv	compat_sys_readv
+146	i386	writev	sys_writev	compat_sys_writev
+147	i386	getsid	sys_getsid
+148	i386	fdatasync	sys_fdatasync
+149	i386	_sysctl	sys_sysctl	compat_sys_sysctl
+150	i386	mlock	sys_mlock
+151	i386	munlock	sys_munlock
+152	i386	mlockall	sys_mlockall
+153	i386	munlockall	sys_munlockall
+154	i386	sched_setparam	sys_sched_setparam
+155	i386	sched_getparam	sys_sched_getparam
+156	i386	sched_setscheduler	sys_sched_setscheduler
+157	i386	sched_getscheduler	sys_sched_getscheduler
+158	i386	sched_yield	sys_sched_yield
+159	i386	sched_get_priority_max	sys_sched_get_priority_max
+160	i386	sched_get_priority_min	sys_sched_get_priority_min
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	sys32_sched_rr_get_interval
+162	i386	nanosleep	sys_nanosleep	compat_sys_nanosleep
+163	i386	mremap	sys_mremap
+164	i386	setresuid	sys_setresuid16
+165	i386	getresuid	sys_getresuid16
+166	i386	vm86	ptregs_vm86	sys32_vm86_warning
+167	i386	query_module
+168	i386	poll	sys_poll
+169	i386	nfsservctl
+170	i386	setresgid	sys_setresgid16
+171	i386	getresgid	sys_getresgid16
+172	i386	prctl	sys_prctl
+173	i386	rt_sigreturn	ptregs_rt_sigreturn	stub32_rt_sigreturn
+174	i386	rt_sigaction	sys_rt_sigaction	sys32_rt_sigaction
+175	i386	rt_sigprocmask	sys_rt_sigprocmask	sys32_rt_sigprocmask
+176	i386	rt_sigpending	sys_rt_sigpending	sys32_rt_sigpending
+177	i386	rt_sigtimedwait	sys_rt_sigtimedwait	compat_sys_rt_sigtimedwait
+178	i386	rt_sigqueueinfo	sys_rt_sigqueueinfo	sys32_rt_sigqueueinfo
+179	i386	rt_sigsuspend	sys_rt_sigsuspend
+180	i386	pread64	sys_pread64	sys32_pread
+181	i386	pwrite64	sys_pwrite64	sys32_pwrite
+182	i386	chown	sys_chown16
+183	i386	getcwd	sys_getcwd
+184	i386	capget	sys_capget
+185	i386	capset	sys_capset
+186	i386	sigaltstack	ptregs_sigaltstack	stub32_sigaltstack
+187	i386	sendfile	sys_sendfile	sys32_sendfile
+188	i386	getpmsg
+189	i386	putpmsg
+190	i386	vfork	ptregs_vfork	stub32_vfork
+191	i386	ugetrlimit	sys_getrlimit	compat_sys_getrlimit
+192	i386	mmap2	sys_mmap_pgoff
+193	i386	truncate64	sys_truncate64	sys32_truncate64
+194	i386	ftruncate64	sys_ftruncate64	sys32_ftruncate64
+195	i386	stat64	sys_stat64	sys32_stat64
+196	i386	lstat64	sys_lstat64	sys32_lstat64
+197	i386	fstat64	sys_fstat64	sys32_fstat64
+198	i386	lchown32	sys_lchown
+199	i386	getuid32	sys_getuid
+200	i386	getgid32	sys_getgid
+201	i386	geteuid32	sys_geteuid
+202	i386	getegid32	sys_getegid
+203	i386	setreuid32	sys_setreuid
+204	i386	setregid32	sys_setregid
+205	i386	getgroups32	sys_getgroups
+206	i386	setgroups32	sys_setgroups
+207	i386	fchown32	sys_fchown
+208	i386	setresuid32	sys_setresuid
+209	i386	getresuid32	sys_getresuid
+210	i386	setresgid32	sys_setresgid
+211	i386	getresgid32	sys_getresgid
+212	i386	chown32	sys_chown
+213	i386	setuid32	sys_setuid
+214	i386	setgid32	sys_setgid
+215	i386	setfsuid32	sys_setfsuid
+216	i386	setfsgid32	sys_setfsgid
+217	i386	pivot_root	sys_pivot_root
+218	i386	mincore	sys_mincore
+219	i386	madvise	sys_madvise
+220	i386	getdents64	sys_getdents64	compat_sys_getdents64
+221	i386	fcntl64	sys_fcntl64	compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224	i386	gettid	sys_gettid
+225	i386	readahead	sys_readahead	sys32_readahead
+226	i386	setxattr	sys_setxattr
+227	i386	lsetxattr	sys_lsetxattr
+228	i386	fsetxattr	sys_fsetxattr
+229	i386	getxattr	sys_getxattr
+230	i386	lgetxattr	sys_lgetxattr
+231	i386	fgetxattr	sys_fgetxattr
+232	i386	listxattr	sys_listxattr
+233	i386	llistxattr	sys_llistxattr
+234	i386	flistxattr	sys_flistxattr
+235	i386	removexattr	sys_removexattr
+236	i386	lremovexattr	sys_lremovexattr
+237	i386	fremovexattr	sys_fremovexattr
+238	i386	tkill	sys_tkill
+239	i386	sendfile64	sys_sendfile64
+240	i386	futex	sys_futex	compat_sys_futex
+241	i386	sched_setaffinity	sys_sched_setaffinity	compat_sys_sched_setaffinity
+242	i386	sched_getaffinity	sys_sched_getaffinity	compat_sys_sched_getaffinity
+243	i386	set_thread_area	sys_set_thread_area
+244	i386	get_thread_area	sys_get_thread_area
+245	i386	io_setup	sys_io_setup	compat_sys_io_setup
+246	i386	io_destroy	sys_io_destroy
+247	i386	io_getevents	sys_io_getevents	compat_sys_io_getevents
+248	i386	io_submit	sys_io_submit	compat_sys_io_submit
+249	i386	io_cancel	sys_io_cancel
+250	i386	fadvise64	sys_fadvise64	sys32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252	i386	exit_group	sys_exit_group
+253	i386	lookup_dcookie	sys_lookup_dcookie	sys32_lookup_dcookie
+254	i386	epoll_create	sys_epoll_create
+255	i386	epoll_ctl	sys_epoll_ctl
+256	i386	epoll_wait	sys_epoll_wait
+257	i386	remap_file_pages	sys_remap_file_pages
+258	i386	set_tid_address	sys_set_tid_address
+259	i386	timer_create	sys_timer_create	compat_sys_timer_create
+260	i386	timer_settime	sys_timer_settime	compat_sys_timer_settime
+261	i386	timer_gettime	sys_timer_gettime	compat_sys_timer_gettime
+262	i386	timer_getoverrun	sys_timer_getoverrun
+263	i386	timer_delete	sys_timer_delete
+264	i386	clock_settime	sys_clock_settime	compat_sys_clock_settime
+265	i386	clock_gettime	sys_clock_gettime	compat_sys_clock_gettime
+266	i386	clock_getres	sys_clock_getres	compat_sys_clock_getres
+267	i386	clock_nanosleep	sys_clock_nanosleep	compat_sys_clock_nanosleep
+268	i386	statfs64	sys_statfs64	compat_sys_statfs64
+269	i386	fstatfs64	sys_fstatfs64	compat_sys_fstatfs64
+270	i386	tgkill	sys_tgkill
+271	i386	utimes	sys_utimes	compat_sys_utimes
+272	i386	fadvise64_64	sys_fadvise64_64	sys32_fadvise64_64
+273	i386	vserver
+274	i386	mbind	sys_mbind
+275	i386	get_mempolicy	sys_get_mempolicy	compat_sys_get_mempolicy
+276	i386	set_mempolicy	sys_set_mempolicy
+277	i386	mq_open	sys_mq_open	compat_sys_mq_open
+278	i386	mq_unlink	sys_mq_unlink
+279	i386	mq_timedsend	sys_mq_timedsend	compat_sys_mq_timedsend
+280	i386	mq_timedreceive	sys_mq_timedreceive	compat_sys_mq_timedreceive
+281	i386	mq_notify	sys_mq_notify	compat_sys_mq_notify
+282	i386	mq_getsetaddr	sys_mq_getsetattr	compat_sys_mq_getsetattr
+283	i386	kexec_load	sys_kexec_load	compat_sys_kexec_load
+284	i386	waitid	sys_waitid	compat_sys_waitid
+# 285 sys_setaltroot
+286	i386	add_key	sys_add_key
+287	i386	request_key	sys_request_key
+288	i386	keyctl	sys_keyctl
+289	i386	ioprio_set	sys_ioprio_set
+290	i386	ioprio_get	sys_ioprio_get
+291	i386	inotify_init	sys_inotify_init
+292	i386	inotify_add_watch	sys_inotify_add_watch
+293	i386	inotify_rm_watch	sys_inotify_rm_watch
+294	i386	migrate_pages	sys_migrate_pages
+295	i386	openat	sys_openat	compat_sys_openat
+296	i386	mkdirat	sys_mkdirat
+297	i386	mknodat	sys_mknodat
+298	i386	fchownat	sys_fchownat
+299	i386	futimesat	sys_futimesat	compat_sys_futimesat
+300	i386	fstatat64	sys_fstatat64	sys32_fstatat
+301	i386	unlinkat	sys_unlinkat
+302	i386	renameat	sys_renameat
+303	i386	linkat	sys_linkat
+304	i386	symlinkat	sys_symlinkat
+305	i386	readlinkat	sys_readlinkat
+306	i386	fchmodat	sys_fchmodat
+307	i386	faccessat	sys_faccessat
+308	i386	pselect6	sys_pselect6	compat_sys_pselect6
+309	i386	ppoll	sys_ppoll	compat_sys_ppoll
+310	i386	unshare	sys_unshare
+311	i386	set_robust_list	sys_set_robust_list	compat_sys_set_robust_list
+312	i386	get_robust_list	sys_get_robust_list	compat_sys_get_robust_list
+313	i386	splice	sys_splice
+314	i386	sync_file_range	sys_sync_file_range	sys32_sync_file_range
+315	i386	tee	sys_tee
+316	i386	vmsplice	sys_vmsplice	compat_sys_vmsplice
+317	i386	move_pages	sys_move_pages	compat_sys_move_pages
+318	i386	getcpu	sys_getcpu
+319	i386	epoll_pwait	sys_epoll_pwait
+320	i386	utimensat	sys_utimensat	compat_sys_utimensat
+321	i386	signalfd	sys_signalfd	compat_sys_signalfd
+322	i386	timerfd_create	sys_timerfd_create
+323	i386	eventfd	sys_eventfd
+324	i386	fallocate	sys_fallocate	sys32_fallocate
+325	i386	timerfd_settime	sys_timerfd_settime	compat_sys_timerfd_settime
+326	i386	timerfd_gettime	sys_timerfd_gettime	compat_sys_timerfd_gettime
+327	i386	signalfd4	sys_signalfd4	compat_sys_signalfd4
+328	i386	eventfd2	sys_eventfd2
+329	i386	epoll_create1	sys_epoll_create1
+330	i386	dup3	sys_dup3
+331	i386	pipe2	sys_pipe2
+332	i386	inotify_init1	sys_inotify_init1
+333	i386	preadv	sys_preadv	compat_sys_preadv
+334	i386	pwritev	sys_pwritev	compat_sys_pwritev
+335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
+336	i386	perf_event_open	sys_perf_event_open
+337	i386	recvmmsg	sys_recvmmsg	compat_sys_recvmmsg
+338	i386	fanotify_init	sys_fanotify_init
+339	i386	fanotify_mark	sys_fanotify_mark	sys32_fanotify_mark
+340	i386	prlimit64	sys_prlimit64
+341	i386	name_to_handle_at	sys_name_to_handle_at
+342	i386	open_by_handle_at	sys_open_by_handle_at	compat_sys_open_by_handle_at
+343	i386	clock_adjtime	sys_clock_adjtime	compat_sys_clock_adjtime
+344	i386	syncfs	sys_syncfs
+345	i386	sendmmsg	sys_sendmmsg	compat_sys_sendmmsg
+346	i386	setns	sys_setns
+347	i386	process_vm_readv	sys_process_vm_readv	compat_sys_process_vm_readv
+348	i386	process_vm_writev	sys_process_vm_writev	compat_sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..b440a8f7eefa
--- /dev/null
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,320 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The abi is always "64" for this file (for now.)
+#
+0	64	read	sys_read
+1	64	write	sys_write
+2	64	open	sys_open
+3	64	close	sys_close
+4	64	stat	sys_newstat
+5	64	fstat	sys_newfstat
+6	64	lstat	sys_newlstat
+7	64	poll	sys_poll
+8	64	lseek	sys_lseek
+9	64	mmap	sys_mmap
+10	64	mprotect	sys_mprotect
+11	64	munmap	sys_munmap
+12	64	brk	sys_brk
+13	64	rt_sigaction	sys_rt_sigaction
+14	64	rt_sigprocmask	sys_rt_sigprocmask
+15	64	rt_sigreturn	stub_rt_sigreturn
+16	64	ioctl	sys_ioctl
+17	64	pread64	sys_pread64
+18	64	pwrite64	sys_pwrite64
+19	64	readv	sys_readv
+20	64	writev	sys_writev
+21	64	access	sys_access
+22	64	pipe	sys_pipe
+23	64	select	sys_select
+24	64	sched_yield	sys_sched_yield
+25	64	mremap	sys_mremap
+26	64	msync	sys_msync
+27	64	mincore	sys_mincore
+28	64	madvise	sys_madvise
+29	64	shmget	sys_shmget
+30	64	shmat	sys_shmat
+31	64	shmctl	sys_shmctl
+32	64	dup	sys_dup
+33	64	dup2	sys_dup2
+34	64	pause	sys_pause
+35	64	nanosleep	sys_nanosleep
+36	64	getitimer	sys_getitimer
+37	64	alarm	sys_alarm
+38	64	setitimer	sys_setitimer
+39	64	getpid	sys_getpid
+40	64	sendfile	sys_sendfile64
+41	64	socket	sys_socket
+42	64	connect	sys_connect
+43	64	accept	sys_accept
+44	64	sendto	sys_sendto
+45	64	recvfrom	sys_recvfrom
+46	64	sendmsg	sys_sendmsg
+47	64	recvmsg	sys_recvmsg
+48	64	shutdown	sys_shutdown
+49	64	bind	sys_bind
+50	64	listen	sys_listen
+51	64	getsockname	sys_getsockname
+52	64	getpeername	sys_getpeername
+53	64	socketpair	sys_socketpair
+54	64	setsockopt	sys_setsockopt
+55	64	getsockopt	sys_getsockopt
+56	64	clone	stub_clone
+57	64	fork	stub_fork
+58	64	vfork	stub_vfork
+59	64	execve	stub_execve
+60	64	exit	sys_exit
+61	64	wait4	sys_wait4
+62	64	kill	sys_kill
+63	64	uname	sys_newuname
+64	64	semget	sys_semget
+65	64	semop	sys_semop
+66	64	semctl	sys_semctl
+67	64	shmdt	sys_shmdt
+68	64	msgget	sys_msgget
+69	64	msgsnd	sys_msgsnd
+70	64	msgrcv	sys_msgrcv
+71	64	msgctl	sys_msgctl
+72	64	fcntl	sys_fcntl
+73	64	flock	sys_flock
+74	64	fsync	sys_fsync
+75	64	fdatasync	sys_fdatasync
+76	64	truncate	sys_truncate
+77	64	ftruncate	sys_ftruncate
+78	64	getdents	sys_getdents
+79	64	getcwd	sys_getcwd
+80	64	chdir	sys_chdir
+81	64	fchdir	sys_fchdir
+82	64	rename	sys_rename
+83	64	mkdir	sys_mkdir
+84	64	rmdir	sys_rmdir
+85	64	creat	sys_creat
+86	64	link	sys_link
+87	64	unlink	sys_unlink
+88	64	symlink	sys_symlink
+89	64	readlink	sys_readlink
+90	64	chmod	sys_chmod
+91	64	fchmod	sys_fchmod
+92	64	chown	sys_chown
+93	64	fchown	sys_fchown
+94	64	lchown	sys_lchown
+95	64	umask	sys_umask
+96	64	gettimeofday	sys_gettimeofday
+97	64	getrlimit	sys_getrlimit
+98	64	getrusage	sys_getrusage
+99	64	sysinfo	sys_sysinfo
+100	64	times	sys_times
+101	64	ptrace	sys_ptrace
+102	64	getuid	sys_getuid
+103	64	syslog	sys_syslog
+104	64	getgid	sys_getgid
+105	64	setuid	sys_setuid
+106	64	setgid	sys_setgid
+107	64	geteuid	sys_geteuid
+108	64	getegid	sys_getegid
+109	64	setpgid	sys_setpgid
+110	64	getppid	sys_getppid
+111	64	getpgrp	sys_getpgrp
+112	64	setsid	sys_setsid
+113	64	setreuid	sys_setreuid
+114	64	setregid	sys_setregid
+115	64	getgroups	sys_getgroups
+116	64	setgroups	sys_setgroups
+117	64	setresuid	sys_setresuid
+118	64	getresuid	sys_getresuid
+119	64	setresgid	sys_setresgid
+120	64	getresgid	sys_getresgid
+121	64	getpgid	sys_getpgid
+122	64	setfsuid	sys_setfsuid
+123	64	setfsgid	sys_setfsgid
+124	64	getsid	sys_getsid
+125	64	capget	sys_capget
+126	64	capset	sys_capset
+127	64	rt_sigpending	sys_rt_sigpending
+128	64	rt_sigtimedwait	sys_rt_sigtimedwait
+129	64	rt_sigqueueinfo	sys_rt_sigqueueinfo
+130	64	rt_sigsuspend	sys_rt_sigsuspend
+131	64	sigaltstack	stub_sigaltstack
+132	64	utime	sys_utime
+133	64	mknod	sys_mknod
+134	64	uselib
+135	64	personality	sys_personality
+136	64	ustat	sys_ustat
+137	64	statfs	sys_statfs
+138	64	fstatfs	sys_fstatfs
+139	64	sysfs	sys_sysfs
+140	64	getpriority	sys_getpriority
+141	64	setpriority	sys_setpriority
+142	64	sched_setparam	sys_sched_setparam
+143	64	sched_getparam	sys_sched_getparam
+144	64	sched_setscheduler	sys_sched_setscheduler
+145	64	sched_getscheduler	sys_sched_getscheduler
+146	64	sched_get_priority_max	sys_sched_get_priority_max
+147	64	sched_get_priority_min	sys_sched_get_priority_min
+148	64	sched_rr_get_interval	sys_sched_rr_get_interval
+149	64	mlock	sys_mlock
+150	64	munlock	sys_munlock
+151	64	mlockall	sys_mlockall
+152	64	munlockall	sys_munlockall
+153	64	vhangup	sys_vhangup
+154	64	modify_ldt	sys_modify_ldt
+155	64	pivot_root	sys_pivot_root
+156	64	_sysctl	sys_sysctl
+157	64	prctl	sys_prctl
+158	64	arch_prctl	sys_arch_prctl
+159	64	adjtimex	sys_adjtimex
+160	64	setrlimit	sys_setrlimit
+161	64	chroot	sys_chroot
+162	64	sync	sys_sync
+163	64	acct	sys_acct
+164	64	settimeofday	sys_settimeofday
+165	64	mount	sys_mount
+166	64	umount2	sys_umount
+167	64	swapon	sys_swapon
+168	64	swapoff	sys_swapoff
+169	64	reboot	sys_reboot
+170	64	sethostname	sys_sethostname
+171	64	setdomainname	sys_setdomainname
+172	64	iopl	stub_iopl
+173	64	ioperm	sys_ioperm
+174	64	create_module
+175	64	init_module	sys_init_module
+176	64	delete_module	sys_delete_module
+177	64	get_kernel_syms
+178	64	query_module
+179	64	quotactl	sys_quotactl
+180	64	nfsservctl
+181	64	getpmsg
+182	64	putpmsg
+183	64	afs_syscall
+184	64	tuxcall
+185	64	security
+186	64	gettid	sys_gettid
+187	64	readahead	sys_readahead
+188	64	setxattr	sys_setxattr
+189	64	lsetxattr	sys_lsetxattr
+190	64	fsetxattr	sys_fsetxattr
+191	64	getxattr	sys_getxattr
+192	64	lgetxattr	sys_lgetxattr
+193	64	fgetxattr	sys_fgetxattr
+194	64	listxattr	sys_listxattr
+195	64	llistxattr	sys_llistxattr
+196	64	flistxattr	sys_flistxattr
+197	64	removexattr	sys_removexattr
+198	64	lremovexattr	sys_lremovexattr
+199	64	fremovexattr	sys_fremovexattr
+200	64	tkill	sys_tkill
+201	64	time	sys_time
+202	64	futex	sys_futex
+203	64	sched_setaffinity	sys_sched_setaffinity
+204	64	sched_getaffinity	sys_sched_getaffinity
+205	64	set_thread_area
+206	64	io_setup	sys_io_setup
+207	64	io_destroy	sys_io_destroy
+208	64	io_getevents	sys_io_getevents
+209	64	io_submit	sys_io_submit
+210	64	io_cancel	sys_io_cancel
+211	64	get_thread_area
+212	64	lookup_dcookie	sys_lookup_dcookie
+213	64	epoll_create	sys_epoll_create
+214	64	epoll_ctl_old
+215	64	epoll_wait_old
+216	64	remap_file_pages	sys_remap_file_pages
+217	64	getdents64	sys_getdents64
+218	64	set_tid_address	sys_set_tid_address
+219	64	restart_syscall	sys_restart_syscall
+220	64	semtimedop	sys_semtimedop
+221	64	fadvise64	sys_fadvise64
+222	64	timer_create	sys_timer_create
+223	64	timer_settime	sys_timer_settime
+224	64	timer_gettime	sys_timer_gettime
+225	64	timer_getoverrun	sys_timer_getoverrun
+226	64	timer_delete	sys_timer_delete
+227	64	clock_settime	sys_clock_settime
+228	64	clock_gettime	sys_clock_gettime
+229	64	clock_getres	sys_clock_getres
+230	64	clock_nanosleep	sys_clock_nanosleep
+231	64	exit_group	sys_exit_group
+232	64	epoll_wait	sys_epoll_wait
+233	64	epoll_ctl	sys_epoll_ctl
+234	64	tgkill	sys_tgkill
+235	64	utimes	sys_utimes
+236	64	vserver
+237	64	mbind	sys_mbind
+238	64	set_mempolicy	sys_set_mempolicy
+239	64	get_mempolicy	sys_get_mempolicy
+240	64	mq_open	sys_mq_open
+241	64	mq_unlink	sys_mq_unlink
+242	64	mq_timedsend	sys_mq_timedsend
+243	64	mq_timedreceive	sys_mq_timedreceive
+244	64	mq_notify	sys_mq_notify
+245	64	mq_getsetattr	sys_mq_getsetattr
+246	64	kexec_load	sys_kexec_load
+247	64	waitid	sys_waitid
+248	64	add_key	sys_add_key
+249	64	request_key	sys_request_key
+250	64	keyctl	sys_keyctl
+251	64	ioprio_set	sys_ioprio_set
+252	64	ioprio_get	sys_ioprio_get
+253	64	inotify_init	sys_inotify_init
+254	64	inotify_add_watch	sys_inotify_add_watch
+255	64	inotify_rm_watch	sys_inotify_rm_watch
+256	64	migrate_pages	sys_migrate_pages
+257	64	openat	sys_openat
+258	64	mkdirat	sys_mkdirat
+259	64	mknodat	sys_mknodat
+260	64	fchownat	sys_fchownat
+261	64	futimesat	sys_futimesat
+262	64	newfstatat	sys_newfstatat
+263	64	unlinkat	sys_unlinkat
+264	64	renameat	sys_renameat
+265	64	linkat	sys_linkat
+266	64	symlinkat	sys_symlinkat
+267	64	readlinkat	sys_readlinkat
+268	64	fchmodat	sys_fchmodat
+269	64	faccessat	sys_faccessat
+270	64	pselect6	sys_pselect6
+271	64	ppoll	sys_ppoll
+272	64	unshare	sys_unshare
+273	64	set_robust_list	sys_set_robust_list
+274	64	get_robust_list	sys_get_robust_list
+275	64	splice	sys_splice
+276	64	tee	sys_tee
+277	64	sync_file_range	sys_sync_file_range
+278	64	vmsplice	sys_vmsplice
+279	64	move_pages	sys_move_pages
+280	64	utimensat	sys_utimensat
+281	64	epoll_pwait	sys_epoll_pwait
+282	64	signalfd	sys_signalfd
+283	64	timerfd_create	sys_timerfd_create
+284	64	eventfd	sys_eventfd
+285	64	fallocate	sys_fallocate
+286	64	timerfd_settime	sys_timerfd_settime
+287	64	timerfd_gettime	sys_timerfd_gettime
+288	64	accept4	sys_accept4
+289	64	signalfd4	sys_signalfd4
+290	64	eventfd2	sys_eventfd2
+291	64	epoll_create1	sys_epoll_create1
+292	64	dup3	sys_dup3
+293	64	pipe2	sys_pipe2
+294	64	inotify_init1	sys_inotify_init1
+295	64	preadv	sys_preadv
+296	64	pwritev	sys_pwritev
+297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
+298	64	perf_event_open	sys_perf_event_open
+299	64	recvmmsg	sys_recvmmsg
+300	64	fanotify_init	sys_fanotify_init
+301	64	fanotify_mark	sys_fanotify_mark
+302	64	prlimit64	sys_prlimit64
+303	64	name_to_handle_at	sys_name_to_handle_at
+304	64	open_by_handle_at	sys_open_by_handle_at
+305	64	clock_adjtime	sys_clock_adjtime
+306	64	syncfs	sys_syncfs
+307	64	sendmmsg	sys_sendmmsg
+308	64	setns	sys_setns
+309	64	getcpu	sys_getcpu
+310	64	process_vm_readv	sys_process_vm_readv
+311	64	process_vm_writev	sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
new file mode 100644
index 000000000000..31fd5f1f38f7
--- /dev/null
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+my_abis=`echo "($3)" | tr ',' '|'`
+prefix="$4"
+offset="$5"
+
+fileguard=_ASM_X86_`basename "$out" | sed \
+    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
+    echo "#ifndef ${fileguard}"
+    echo "#define ${fileguard} 1"
+    echo ""
+
+    while read nr abi name entry ; do
+	if [ -z "$offset" ]; then
+	    echo "#define __NR_${prefix}${name} $nr"
+	else
+	    echo "#define __NR_${prefix}${name} ($offset + $nr)"
+	fi
+    done
+
+    echo ""
+    echo "#endif /* ${fileguard} */"
+) > "$out"
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh
new file mode 100644
index 000000000000..0e7f8ec071e7
--- /dev/null
+++ b/arch/x86/syscalls/syscalltbl.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+
+grep '^[0-9]' "$in" | sort -n | (
+    while read nr abi name entry compat; do
+	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
+	if [ -n "$compat" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
+	elif [ -n "$entry" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+	fi
+    done
+) > "$out"
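
[Editor's note: taken together, syscallhdr.sh emits one "#define __NR_<prefix><name> <nr>" per matching table row, and syscalltbl.sh emits one "__SYSCALL_<ABI>()" invocation per row, defaulting the compat entry to the native entry when the fifth column is empty. Assuming the first rows of syscall_64.tbl above, the generated headers would begin roughly as follows; this is a sketch of the expected shape, not captured build output.]

	/* From syscallhdr.sh (generated asm/unistd_64.h), roughly: */
	#define __NR_read 0
	#define __NR_write 1
	#define __NR_open 2

	/* From syscalltbl.sh (generated asm/syscalls_64.h), roughly: */
	__SYSCALL_64(0, sys_read, sys_read)
	__SYSCALL_64(1, sys_write, sys_write)
	__SYSCALL_64(2, sys_open, sys_open)

[Each consumer defines __SYSCALL_64() to whatever expansion it needs before including the header, as the sys_call_table changes further below demonstrate.]
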
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,14 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8fb58400e415..5d065b2222d3 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o
 USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
 
 extra-y += user-offsets.s
-$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS)
+$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
+	-Iarch/x86/include/generated
 
 UNPROFILE_OBJS := stub_segv.o
 CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b1621747f..2bbe1ec2d96a 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,5 +1,15 @@
+#ifndef __SYSDEP_X86_PTRACE_H
+#define __SYSDEP_X86_PTRACE_H
+
 #ifdef __i386__
 #include "ptrace_32.h"
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_SYSCALL_RET(regs);
+}
+
+#endif /* __SYSDEP_X86_PTRACE_H */
diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S
deleted file mode 100644
index a7ca80d2dceb..000000000000
--- a/arch/x86/um/sys_call_table_32.S
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <linux/linkage.h>
-/* Steal i386 syscall table for our purposes, but with some slight changes.*/
-
-#define sys_iopl sys_ni_syscall
-#define sys_ioperm sys_ni_syscall
-
-#define sys_vm86old sys_ni_syscall
-#define sys_vm86 sys_ni_syscall
-
-#define old_mmap sys_old_mmap
-
-#define ptregs_fork sys_fork
-#define ptregs_execve sys_execve
-#define ptregs_iopl sys_iopl
-#define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
-#define ptregs_vm86 sys_vm86
-#define ptregs_sigaltstack sys_sigaltstack
-#define ptregs_vfork sys_vfork
-
-.section .rodata,"a"
-
-#include "../kernel/syscall_table_32.S"
-
-ENTRY(syscall_table_size)
-.long .-sys_call_table
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 000000000000..416bd40c0eba
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,55 @@
+/*
+ * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define sys_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+#define sys_vm86old sys_ni_syscall
+#define sys_vm86 sys_ni_syscall
+
+#define old_mmap sys_old_mmap
+
+#define ptregs_fork sys_fork
+#define ptregs_execve sys_execve
+#define ptregs_iopl sys_iopl
+#define ptregs_vm86old sys_vm86old
+#define ptregs_clone sys_clone
+#define ptregs_vm86 sys_vm86
+#define ptregs_sigaltstack sys_sigaltstack
+#define ptregs_vfork sys_vfork
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+
+#undef __SYSCALL_I386
+#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void sys_ni_syscall(void);
+
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
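
[Editor's note: the table initializer above relies on GNU C range designators: the "[0 ... __NR_syscall_max]" entry fills every slot with sys_ni_syscall first, and the per-slot designators that the included header expands to then override that filler. A standalone sketch of the fill-then-override pattern follows; the names and numbers are illustrative only, and it compiles with GCC (which may warn under -Woverride-init).]

	#include <stdio.h>

	typedef void (*sys_call_ptr_t)(void);

	static void ni_syscall(void) { puts("ENOSYS"); }
	static void fake_read(void)  { puts("read"); }

	#define NR_MAX 7

	static const sys_call_ptr_t table[NR_MAX + 1] = {
		[0 ... NR_MAX] = &ni_syscall,	/* default every slot first */
		[3] = fake_read,	/* later designator overrides the filler */
	};

	int main(void)
	{
		table[3]();	/* prints "read" */
		table[5]();	/* prints "ENOSYS" */
		return 0;
	}
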
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 99522f78b162..fe626c3ba01b 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -1,11 +1,12 @@
 /*
- * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
+ * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
  * with some changes for UML.
  */
 
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <generated/user_constants.h>
 
 #define __NO_STUBS
 
@@ -34,31 +35,23 @@
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include "../../x86/include/asm/unistd_64.h"
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
 
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#undef __SYSCALL_64
+#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-/*
- * We used to have a trick here which made sure that holes in the
- * x86_64 table were filled in with sys_ni_syscall, but a comment in
- * unistd_64.h says that holes aren't allowed, so the trick was
- * removed.
- * The trick looked like this
- *	[0 ... UM_NR_syscall_max] = &sys_ni_syscall
- * before including unistd_64.h - the later initializations overwrote
- * the sys_ni_syscall filler.
- */
-
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
-#include <asm/unistd_64.h>
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
 };
 
 int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ca49be8ddd0c..5edf4f4bbf53 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,6 +8,18 @@
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
+#ifdef __i386__
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+#else
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_64.h>
+};
+#endif
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -77,4 +89,7 @@ void foo(void)
 	DEFINE(UM_PROT_READ, PROT_READ);
 	DEFINE(UM_PROT_WRITE, PROT_WRITE);
 	DEFINE(UM_PROT_EXEC, PROT_EXEC);
+
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
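
[Editor's note: the user-offsets.c hunks above use a sizeof trick to derive the table bounds: the generated header expands to one "[nr] = 1" designator per syscall, so the char array's size is exactly the highest syscall number plus one, and __NR_syscall_max falls out of sizeof() with no hand-maintained constant. A toy standalone sketch, with made-up numbers:]

	/* One designated entry per "syscall number"; 42 plays the highest one. */
	static char syscalls[] = {
		[0]  = 1,
		[1]  = 1,
		[42] = 1,
	};

	/* sizeof(syscalls) == 43 here, so: */
	enum {
		syscall_max = sizeof(syscalls) - 1,	/* 42 */
		nr_syscalls = sizeof(syscalls),		/* 43 */
	};
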
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 26c731a106af..fdce49c7aff6 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
 
 config XEN_MAX_DOMAIN_MEMORY
 	int
-	default 128
+	default 500 if X86_64
+	default 64 if X86_32
 	depends on XEN
 	help
 	  This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd98ea0..ef1db1900d86 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
 	.llseek = no_llseek,
 };
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements)
 {
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e28132084832..78d25499be5b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements);
 
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 5a40d24ba331..3a5f55d51907 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
 	return 0;
 }
 
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+			     unsigned long addr, void *data)
+{
+	uint64_t **frames = (uint64_t **)data;
+
+	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+	(*frames)++;
+	return 0;
+}
+
 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 			unsigned long addr, void *data)
 {
@@ -64,10 +78,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
-			   struct grant_entry **__shared)
+			   void **__shared)
 {
 	int rc;
-	struct grant_entry *shared = *__shared;
+	void *shared = *__shared;
 
 	if (shared == NULL) {
 		struct vm_struct *area =
@@ -83,8 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-			      unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	int rc;
+	grant_status_t *shared = *__shared;
+
+	if (shared == NULL) {
+		/* No need to pass in PTE as we are going to do it
+		 * in apply_to_page_range anyhow. */
+		struct vm_struct *area =
+			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+		BUG_ON(area == NULL);
+		shared = area->addr;
+		*__shared = shared;
+	}
+
+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+				 PAGE_SIZE * nr_gframes,
+				 map_pte_fn_status, &frames);
+	return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f4bf8aa574f4..58a0e46c404d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1852,7 +1852,7 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	xen_write_cr3(__pa(initial_page_table));
 
 	memblock_reserve(__pa(xen_start_info->pt_base),
-			 xen_start_info->nr_pt_frames * PAGE_SIZE));
+			 xen_start_info->nr_pt_frames * PAGE_SIZE);
 
 	return initial_page_table;
 }