aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-01-27 11:14:02 -0500
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-01-27 11:14:02 -0500
commit6c02b7b1610f873888af20f291c07730889ff0f9 (patch)
tree1b33e6642cc81605b8d37c0bda0abff0ba64fa2d /arch/x86
parent7a7546b377bdaa25ac77f33d9433c59f259b9688 (diff)
parentdcd6c92267155e70a94b3927bce681ce74b80d1f (diff)
Merge commit 'v3.3-rc1' into stable/for-linus-fixes-3.3
* commit 'v3.3-rc1': (9775 commits) Linux 3.3-rc1 x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits qnx4: don't leak ->BitMap on late failure exits qnx4: reduce the insane nesting in qnx4_checkroot() qnx4: di_fname is an array, for crying out loud... KEYS: Permit key_serial() to be called with a const key pointer keys: fix user_defined key sparse messages ima: fix cred sparse warning uml: fix compile for x86-64 MPILIB: Add a missing ENOMEM check tpm: fix (ACPI S3) suspend regression nvme: fix merge error due to change of 'make_request_fn' fn type xen: using EXPORT_SYMBOL requires including export.h gpio: tps65910: Use correct offset for gpio initialization acpi/apei/einj: Add extensions to EINJ from rev 5.0 of acpi spec intel_idle: Split up and provide per CPU initialization func ACPI processor: Remove unneeded variable passed by acpi_processor_hotadd_init V2 tg3: Fix single-vector MSI-X code openvswitch: Fix multipart datapath dumps. ipv6: fix per device IP snmp counters ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/.gitignore1
-rw-r--r--arch/x86/Kconfig90
-rw-r--r--arch/x86/Kconfig.cpu6
-rw-r--r--arch/x86/Kconfig.debug25
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/boot/compressed/Makefile10
-rw-r--r--arch/x86/boot/compressed/eboot.c1022
-rw-r--r--arch/x86/boot/compressed/eboot.h61
-rw-r--r--arch/x86/boot/compressed/efi_stub_32.S86
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S1
-rw-r--r--arch/x86/boot/compressed/head_32.S22
-rw-r--r--arch/x86/boot/compressed/head_64.S20
-rw-r--r--arch/x86/boot/compressed/string.c9
-rw-r--r--arch/x86/boot/header.S158
-rw-r--r--arch/x86/boot/string.c35
-rw-r--r--arch/x86/boot/tools/build.c39
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S638
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S761
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c1070
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c218
-rw-r--r--arch/x86/ia32/Makefile1
-rw-r--r--arch/x86/ia32/ia32entry.S416
-rw-r--r--arch/x86/ia32/nosyscall.c7
-rw-r--r--arch/x86/ia32/syscall_ia32.c25
-rw-r--r--arch/x86/include/asm/Kbuild5
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h6
-rw-r--r--arch/x86/include/asm/apic_flat_64.h7
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h2
-rw-r--r--arch/x86/include/asm/bitops.h76
-rw-r--r--arch/x86/include/asm/bootparam.h2
-rw-r--r--arch/x86/include/asm/cmpxchg.h163
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h46
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h43
-rw-r--r--arch/x86/include/asm/cpufeature.h3
-rw-r--r--arch/x86/include/asm/debugreg.h22
-rw-r--r--arch/x86/include/asm/desc.h12
-rw-r--r--arch/x86/include/asm/div64.h22
-rw-r--r--arch/x86/include/asm/efi.h4
-rw-r--r--arch/x86/include/asm/fixmap.h2
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/ia32_unistd.h13
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/insn.h7
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h14
-rw-r--r--arch/x86/include/asm/iommu.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h90
-rw-r--r--arch/x86/include/asm/mach_timer.h2
-rw-r--r--arch/x86/include/asm/mc146818rtc.h4
-rw-r--r--arch/x86/include/asm/mce.h14
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mrst.h11
-rw-r--r--arch/x86/include/asm/msr.h9
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h167
-rw-r--r--arch/x86/include/asm/pci.h9
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/percpu.h77
-rw-r--r--arch/x86/include/asm/perf_event.h44
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/serpent.h63
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/include/asm/spinlock.h15
-rw-r--r--arch/x86/include/asm/syscall.h1
-rw-r--r--arch/x86/include/asm/system.h1
-rw-r--r--arch/x86/include/asm/thread_info.h11
-rw-r--r--arch/x86/include/asm/timer.h23
-rw-r--r--arch/x86/include/asm/topology.h4
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/unistd.h55
-rw-r--r--arch/x86/include/asm/unistd_32.h401
-rw-r--r--arch/x86/include/asm/unistd_64.h732
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h107
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1
-rw-r--r--arch/x86/include/asm/x86_init.h4
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/boot.c10
-rw-r--r--arch/x86/kernel/amd_nb.c39
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c113
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c294
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c9
-rw-r--r--arch/x86/kernel/apm_32.c16
-rw-r--r--arch/x86/kernel/asm-offsets.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c8
-rw-r--r--arch/x86/kernel/asm-offsets_64.c19
-rw-r--r--arch/x86/kernel/cpu/amd.c17
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c38
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c204
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c278
-rw-r--r--arch/x86/kernel/cpu/perf_event.h51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c29
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c2
-rw-r--r--arch/x86/kernel/cpu/powerflags.c3
-rw-r--r--arch/x86/kernel/cpu/proc.c4
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/dumpstack_32.c8
-rw-r--r--arch/x86/kernel/dumpstack_64.c8
-rw-r--r--arch/x86/kernel/e820.c66
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/entry_32.S51
-rw-r--r--arch/x86/kernel/entry_64.S263
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c29
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irq_32.c5
-rw-r--r--arch/x86/kernel/irq_64.c38
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kvm.c181
-rw-r--r--arch/x86/kernel/microcode_amd.c209
-rw-r--r--arch/x86/kernel/microcode_core.c91
-rw-r--r--arch/x86/kernel/mpparse.c2
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c102
-rw-r--r--arch/x86/kernel/nmi_selftest.c180
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/process.c10
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c28
-rw-r--r--arch/x86/kernel/quirks.c13
-rw-r--r--arch/x86/kernel/reboot.c21
-rw-r--r--arch/x86/kernel/rtc.c5
-rw-r--r--arch/x86/kernel/setup.c7
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smp.c72
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--arch/x86/kernel/syscall_32.c25
-rw-r--r--arch/x86/kernel/syscall_64.c20
-rw-r--r--arch/x86/kernel/syscall_table_32.S350
-rw-r--r--arch/x86/kernel/traps.c27
-rw-r--r--arch/x86/kernel/tsc.c40
-rw-r--r--arch/x86/kernel/tsc_sync.c4
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/vsyscall_64.c77
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c670
-rw-r--r--arch/x86/kvm/cpuid.h46
-rw-r--r--arch/x86/kvm/emulate.c436
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/i8259.c24
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c547
-rw-r--r--arch/x86/kvm/mmu_audit.c29
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h86
-rw-r--r--arch/x86/kvm/pmu.c533
-rw-r--r--arch/x86/kvm/svm.c15
-rw-r--r--arch/x86/kvm/timer.c26
-rw-r--r--arch/x86/kvm/vmx.c63
-rw-r--r--arch/x86/kvm/x86.c1012
-rw-r--r--arch/x86/kvm/x86.h5
-rw-r--r--arch/x86/lguest/boot.c21
-rw-r--r--arch/x86/lib/inat.c9
-rw-r--r--arch/x86/lib/insn.c4
-rw-r--r--arch/x86/lib/string_32.c8
-rw-r--r--arch/x86/lib/x86-opcode-map.txt610
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c22
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c23
-rw-r--r--arch/x86/mm/init_32.c29
-rw-r--r--arch/x86/mm/init_64.c11
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/mmio-mod.c4
-rw-r--r--arch/x86/mm/numa.c12
-rw-r--r--arch/x86/mm/pageattr.c8
-rw-r--r--arch/x86/mm/srat.c11
-rw-r--r--arch/x86/net/bpf_jit_comp.c4
-rw-r--r--arch/x86/oprofile/Makefile3
-rw-r--r--arch/x86/oprofile/init.c25
-rw-r--r--arch/x86/oprofile/nmi_int.c27
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c50
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c75
-rw-r--r--arch/x86/pci/amd_bus.c43
-rw-r--r--arch/x86/pci/broadcom_bus.c62
-rw-r--r--arch/x86/pci/bus_numa.c31
-rw-r--r--arch/x86/pci/common.c19
-rw-r--r--arch/x86/pci/i386.c20
-rw-r--r--arch/x86/pci/legacy.c3
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/pci/pcbios.c2
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--arch/x86/platform/efi/efi_32.c48
-rw-r--r--arch/x86/platform/geode/alix.c2
-rw-r--r--arch/x86/platform/iris/iris.c2
-rw-r--r--arch/x86/platform/mrst/Makefile6
-rw-r--r--arch/x86/platform/mrst/early_printk_mrst.c16
-rw-r--r--arch/x86/platform/mrst/mrst.c72
-rw-r--r--arch/x86/platform/uv/tlb_uv.c388
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c2
-rw-r--r--arch/x86/syscalls/Makefile43
-rw-r--r--arch/x86/syscalls/syscall_32.tbl357
-rw-r--r--arch/x86/syscalls/syscall_64.tbl320
-rw-r--r--arch/x86/syscalls/syscallhdr.sh27
-rw-r--r--arch/x86/syscalls/syscalltbl.sh15
-rw-r--r--arch/x86/tools/Makefile11
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk21
-rw-r--r--arch/x86/tools/insn_sanity.c275
-rw-r--r--arch/x86/um/Kconfig8
-rw-r--r--arch/x86/um/Makefile3
-rw-r--r--arch/x86/um/shared/sysdep/ptrace.h10
-rw-r--r--arch/x86/um/sys_call_table_32.S26
-rw-r--r--arch/x86/um/sys_call_table_32.c55
-rw-r--r--arch/x86/um/sys_call_table_64.c33
-rw-r--r--arch/x86/um/user-offsets.c15
-rw-r--r--arch/x86/xen/Kconfig4
-rw-r--r--arch/x86/xen/debugfs.c2
-rw-r--r--arch/x86/xen/debugfs.h2
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/grant-table.c44
-rw-r--r--arch/x86/xen/setup.c20
243 files changed, 11699 insertions, 5364 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index 028079065af6..7cab8c08e6d1 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,3 +1,4 @@
1boot/compressed/vmlinux 1boot/compressed/vmlinux
2tools/test_get_len 2tools/test_get_len
3tools/insn_sanity
3 4
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5d1514c263f8..864cc6e6ac8e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,8 +60,12 @@ config X86
60 select PERF_EVENTS 60 select PERF_EVENTS
61 select HAVE_PERF_EVENTS_NMI 61 select HAVE_PERF_EVENTS_NMI
62 select ANON_INODES 62 select ANON_INODES
63 select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
64 select HAVE_CMPXCHG_LOCAL if !M386
65 select HAVE_CMPXCHG_DOUBLE
63 select HAVE_ARCH_KMEMCHECK 66 select HAVE_ARCH_KMEMCHECK
64 select HAVE_USER_RETURN_NOTIFIER 67 select HAVE_USER_RETURN_NOTIFIER
68 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
65 select HAVE_ARCH_JUMP_LABEL 69 select HAVE_ARCH_JUMP_LABEL
66 select HAVE_TEXT_POKE_SMP 70 select HAVE_TEXT_POKE_SMP
67 select HAVE_GENERIC_HARDIRQS 71 select HAVE_GENERIC_HARDIRQS
@@ -77,6 +81,7 @@ config X86
77 select HAVE_BPF_JIT if (X86_64 && NET) 81 select HAVE_BPF_JIT if (X86_64 && NET)
78 select CLKEVT_I8253 82 select CLKEVT_I8253
79 select ARCH_HAVE_NMI_SAFE_CMPXCHG 83 select ARCH_HAVE_NMI_SAFE_CMPXCHG
84 select GENERIC_IOMAP
80 85
81config INSTRUCTION_DECODER 86config INSTRUCTION_DECODER
82 def_bool (KPROBES || PERF_EVENTS) 87 def_bool (KPROBES || PERF_EVENTS)
@@ -120,16 +125,6 @@ config HAVE_LATENCYTOP_SUPPORT
120config MMU 125config MMU
121 def_bool y 126 def_bool y
122 127
123config ZONE_DMA
124 bool "DMA memory allocation support" if EXPERT
125 default y
126 help
127 DMA memory allocation support allows devices with less than 32-bit
128 addressing to allocate within the first 16MB of address space.
129 Disable if no such devices will be used.
130
131 If unsure, say Y.
132
133config SBUS 128config SBUS
134 bool 129 bool
135 130
@@ -142,9 +137,6 @@ config NEED_SG_DMA_LENGTH
142config GENERIC_ISA_DMA 137config GENERIC_ISA_DMA
143 def_bool ISA_DMA_API 138 def_bool ISA_DMA_API
144 139
145config GENERIC_IOMAP
146 def_bool y
147
148config GENERIC_BUG 140config GENERIC_BUG
149 def_bool y 141 def_bool y
150 depends on BUG 142 depends on BUG
@@ -206,9 +198,6 @@ config ZONE_DMA32
206 bool 198 bool
207 default X86_64 199 default X86_64
208 200
209config ARCH_POPULATES_NODE_MAP
210 def_bool y
211
212config AUDIT_ARCH 201config AUDIT_ARCH
213 bool 202 bool
214 default X86_64 203 default X86_64
@@ -256,6 +245,16 @@ source "kernel/Kconfig.freezer"
256 245
257menu "Processor type and features" 246menu "Processor type and features"
258 247
248config ZONE_DMA
249 bool "DMA memory allocation support" if EXPERT
250 default y
251 help
252 DMA memory allocation support allows devices with less than 32-bit
253 addressing to allocate within the first 16MB of address space.
254 Disable if no such devices will be used.
255
256 If unsure, say Y.
257
259source "kernel/time/Kconfig" 258source "kernel/time/Kconfig"
260 259
261config SMP 260config SMP
@@ -345,6 +344,7 @@ config X86_EXTENDED_PLATFORM
345 344
346 If you enable this option then you'll be able to select support 345 If you enable this option then you'll be able to select support
347 for the following (non-PC) 64 bit x86 platforms: 346 for the following (non-PC) 64 bit x86 platforms:
347 Numascale NumaChip
348 ScaleMP vSMP 348 ScaleMP vSMP
349 SGI Ultraviolet 349 SGI Ultraviolet
350 350
@@ -353,6 +353,18 @@ config X86_EXTENDED_PLATFORM
353endif 353endif
354# This is an alphabetically sorted list of 64 bit extended platforms 354# This is an alphabetically sorted list of 64 bit extended platforms
355# Please maintain the alphabetic order if and when there are additions 355# Please maintain the alphabetic order if and when there are additions
356config X86_NUMACHIP
357 bool "Numascale NumaChip"
358 depends on X86_64
359 depends on X86_EXTENDED_PLATFORM
360 depends on NUMA
361 depends on SMP
362 depends on X86_X2APIC
363 depends on !EDAC_AMD64
364 ---help---
365 Adds support for Numascale NumaChip large-SMP systems. Needed to
366 enable more than ~168 cores.
367 If you don't have one of these, you should say N here.
356 368
357config X86_VSMP 369config X86_VSMP
358 bool "ScaleMP vSMP" 370 bool "ScaleMP vSMP"
@@ -392,7 +404,7 @@ config X86_INTEL_CE
392 This option compiles in support for the CE4100 SOC for settop 404 This option compiles in support for the CE4100 SOC for settop
393 boxes and media devices. 405 boxes and media devices.
394 406
395config X86_INTEL_MID 407config X86_WANT_INTEL_MID
396 bool "Intel MID platform support" 408 bool "Intel MID platform support"
397 depends on X86_32 409 depends on X86_32
398 depends on X86_EXTENDED_PLATFORM 410 depends on X86_EXTENDED_PLATFORM
@@ -401,13 +413,19 @@ config X86_INTEL_MID
401 systems which do not have the PCI legacy interfaces (Moorestown, 413 systems which do not have the PCI legacy interfaces (Moorestown,
402 Medfield). If you are building for a PC class system say N here. 414 Medfield). If you are building for a PC class system say N here.
403 415
404if X86_INTEL_MID 416if X86_WANT_INTEL_MID
417
418config X86_INTEL_MID
419 bool
405 420
406config X86_MRST 421config X86_MRST
407 bool "Moorestown MID platform" 422 bool "Moorestown MID platform"
408 depends on PCI 423 depends on PCI
409 depends on PCI_GOANY 424 depends on PCI_GOANY
410 depends on X86_IO_APIC 425 depends on X86_IO_APIC
426 select X86_INTEL_MID
427 select SFI
428 select DW_APB_TIMER
411 select APB_TIMER 429 select APB_TIMER
412 select I2C 430 select I2C
413 select SPI 431 select SPI
@@ -421,6 +439,26 @@ config X86_MRST
421 nor standard legacy replacement devices/features. e.g. Moorestown does 439 nor standard legacy replacement devices/features. e.g. Moorestown does
422 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. 440 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
423 441
442config X86_MDFLD
443 bool "Medfield MID platform"
444 depends on PCI
445 depends on PCI_GOANY
446 depends on X86_IO_APIC
447 select X86_INTEL_MID
448 select SFI
449 select DW_APB_TIMER
450 select APB_TIMER
451 select I2C
452 select SPI
453 select INTEL_SCU_IPC
454 select X86_PLATFORM_DEVICES
455 ---help---
456 Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
457 Internet Device(MID) platform.
458 Unlike standard x86 PCs, Medfield does not have many legacy devices
459 nor standard legacy replacement devices/features. e.g. Medfield does
460 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
461
424endif 462endif
425 463
426config X86_RDC321X 464config X86_RDC321X
@@ -618,7 +656,7 @@ config X86_SUMMIT_NUMA
618 656
619config X86_CYCLONE_TIMER 657config X86_CYCLONE_TIMER
620 def_bool y 658 def_bool y
621 depends on X86_32_NON_STANDARD 659 depends on X86_SUMMIT
622 660
623source "arch/x86/Kconfig.cpu" 661source "arch/x86/Kconfig.cpu"
624 662
@@ -646,9 +684,10 @@ config HPET_EMULATE_RTC
646 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) 684 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
647 685
648config APB_TIMER 686config APB_TIMER
649 def_bool y if MRST 687 def_bool y if X86_INTEL_MID
650 prompt "Langwell APB Timer Support" if X86_MRST 688 prompt "Intel MID APB Timer Support" if X86_INTEL_MID
651 select DW_APB_TIMER 689 select DW_APB_TIMER
690 depends on X86_INTEL_MID && SFI
652 help 691 help
653 APB timer is the replacement for 8254, HPET on X86 MID platforms. 692 APB timer is the replacement for 8254, HPET on X86 MID platforms.
654 The APBT provides a stable time base on SMP 693 The APBT provides a stable time base on SMP
@@ -1476,6 +1515,13 @@ config EFI
1476 resultant kernel should continue to boot on existing non-EFI 1515 resultant kernel should continue to boot on existing non-EFI
1477 platforms. 1516 platforms.
1478 1517
1518config EFI_STUB
1519 bool "EFI stub support"
1520 depends on EFI
1521 ---help---
1522 This kernel feature allows a bzImage to be loaded directly
1523 by EFI firmware without the use of a bootloader.
1524
1479config SECCOMP 1525config SECCOMP
1480 def_bool y 1526 def_bool y
1481 prompt "Enable seccomp to safely compute untrusted bytecode" 1527 prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1728,7 +1774,7 @@ source "drivers/sfi/Kconfig"
1728 1774
1729config X86_APM_BOOT 1775config X86_APM_BOOT
1730 def_bool y 1776 def_bool y
1731 depends on APM || APM_MODULE 1777 depends on APM
1732 1778
1733menuconfig APM 1779menuconfig APM
1734 tristate "APM (Advanced Power Management) BIOS support" 1780 tristate "APM (Advanced Power Management) BIOS support"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
309config X86_CMPXCHG 309config X86_CMPXCHG
310 def_bool X86_64 || (X86_32 && !M386) 310 def_bool X86_64 || (X86_32 && !M386)
311 311
312config CMPXCHG_LOCAL
313 def_bool X86_64 || (X86_32 && !M386)
314
315config CMPXCHG_DOUBLE
316 def_bool y
317
318config X86_L1_CACHE_SHIFT 312config X86_L1_CACHE_SHIFT
319 int 313 int
320 default "7" if MPENTIUM4 || MPSC 314 default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..e46c2147397f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_MRST 46config EARLY_PRINTK_INTEL_MID
47 bool "Early printk for MRST platform support" 47 bool "Early printk for Intel MID platform support"
48 depends on EARLY_PRINTK && X86_MRST 48 depends on EARLY_PRINTK && X86_INTEL_MID
49 49
50config EARLY_PRINTK_DBGP 50config EARLY_PRINTK_DBGP
51 bool "Early printk via EHCI debug port" 51 bool "Early printk via EHCI debug port"
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
63 bool "Check for stack overflows" 63 bool "Check for stack overflows"
64 depends on DEBUG_KERNEL 64 depends on DEBUG_KERNEL
65 ---help--- 65 ---help---
66 This option will cause messages to be printed if free stack space 66 Say Y here if you want to check the overflows of kernel, IRQ
67 drops below a certain limit. 67 and exception stacks. This option will cause messages of the
68 stacks in detail when free stack space drops below a certain
69 limit.
70 If in doubt, say "N".
68 71
69config X86_PTDUMP 72config X86_PTDUMP
70 bool "Export kernel pagetable layout to userspace via debugfs" 73 bool "Export kernel pagetable layout to userspace via debugfs"
@@ -284,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
284 287
285 If unsure, or if you run an older (pre 4.4) gcc, say N. 288 If unsure, or if you run an older (pre 4.4) gcc, say N.
286 289
290config DEBUG_NMI_SELFTEST
291 bool "NMI Selftest"
292 depends on DEBUG_KERNEL && X86_LOCAL_APIC
293 ---help---
294 Enabling this option turns on a quick NMI selftest to verify
295 that the NMI behaves correctly.
296
297 This might help diagnose strange hangs that rely on NMI to
298 function properly.
299
300 If unsure, say N.
301
287endmenu 302endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b02e509072a7..209ba1294592 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,6 +118,12 @@ KBUILD_CFLAGS += $(mflags-y)
118KBUILD_AFLAGS += $(mflags-y) 118KBUILD_AFLAGS += $(mflags-y)
119 119
120### 120###
121# Syscall table generation
122
123archheaders:
124 $(Q)$(MAKE) $(build)=arch/x86/syscalls all
125
126###
121# Kernel objects 127# Kernel objects
122 128
123head-y := arch/x86/kernel/head_$(BITS).o 129head-y := arch/x86/kernel/head_$(BITS).o
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664efb9cee..b123b9a8f5b3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
23 23
24hostprogs-y := mkpiggy 24hostprogs-y := mkpiggy
25 25
26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE 26VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
27 $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
28 $(obj)/piggy.o
29
30ifeq ($(CONFIG_EFI_STUB), y)
31 VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
32endif
33
34$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
27 $(call if_changed,ld) 35 $(call if_changed,ld)
28 @: 36 @:
29 37
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000000..fec216f4fbc3
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1022 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2011 Intel Corporation; author Matt Fleming
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2.
7 *
8 * ----------------------------------------------------------------------- */
9
10#include <linux/efi.h>
11#include <asm/efi.h>
12#include <asm/setup.h>
13#include <asm/desc.h>
14
15#include "eboot.h"
16
17static efi_system_table_t *sys_table;
18
19static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
20 unsigned long *desc_size)
21{
22 efi_memory_desc_t *m = NULL;
23 efi_status_t status;
24 unsigned long key;
25 u32 desc_version;
26
27 *map_size = sizeof(*m) * 32;
28again:
29 /*
30 * Add an additional efi_memory_desc_t because we're doing an
31 * allocation which may be in a new descriptor region.
32 */
33 *map_size += sizeof(*m);
34 status = efi_call_phys3(sys_table->boottime->allocate_pool,
35 EFI_LOADER_DATA, *map_size, (void **)&m);
36 if (status != EFI_SUCCESS)
37 goto fail;
38
39 status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
40 m, &key, desc_size, &desc_version);
41 if (status == EFI_BUFFER_TOO_SMALL) {
42 efi_call_phys1(sys_table->boottime->free_pool, m);
43 goto again;
44 }
45
46 if (status != EFI_SUCCESS)
47 efi_call_phys1(sys_table->boottime->free_pool, m);
48
49fail:
50 *map = m;
51 return status;
52}
53
54/*
55 * Allocate at the highest possible address that is not above 'max'.
56 */
57static efi_status_t high_alloc(unsigned long size, unsigned long align,
58 unsigned long *addr, unsigned long max)
59{
60 unsigned long map_size, desc_size;
61 efi_memory_desc_t *map;
62 efi_status_t status;
63 unsigned long nr_pages;
64 u64 max_addr = 0;
65 int i;
66
67 status = __get_map(&map, &map_size, &desc_size);
68 if (status != EFI_SUCCESS)
69 goto fail;
70
71 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
72again:
73 for (i = 0; i < map_size / desc_size; i++) {
74 efi_memory_desc_t *desc;
75 unsigned long m = (unsigned long)map;
76 u64 start, end;
77
78 desc = (efi_memory_desc_t *)(m + (i * desc_size));
79 if (desc->type != EFI_CONVENTIONAL_MEMORY)
80 continue;
81
82 if (desc->num_pages < nr_pages)
83 continue;
84
85 start = desc->phys_addr;
86 end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
87
88 if ((start + size) > end || (start + size) > max)
89 continue;
90
91 if (end - size > max)
92 end = max;
93
94 if (round_down(end - size, align) < start)
95 continue;
96
97 start = round_down(end - size, align);
98
99 /*
100 * Don't allocate at 0x0. It will confuse code that
101 * checks pointers against NULL.
102 */
103 if (start == 0x0)
104 continue;
105
106 if (start > max_addr)
107 max_addr = start;
108 }
109
110 if (!max_addr)
111 status = EFI_NOT_FOUND;
112 else {
113 status = efi_call_phys4(sys_table->boottime->allocate_pages,
114 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
115 nr_pages, &max_addr);
116 if (status != EFI_SUCCESS) {
117 max = max_addr;
118 max_addr = 0;
119 goto again;
120 }
121
122 *addr = max_addr;
123 }
124
125free_pool:
126 efi_call_phys1(sys_table->boottime->free_pool, map);
127
128fail:
129 return status;
130}
131
132/*
133 * Allocate at the lowest possible address.
134 */
135static efi_status_t low_alloc(unsigned long size, unsigned long align,
136 unsigned long *addr)
137{
138 unsigned long map_size, desc_size;
139 efi_memory_desc_t *map;
140 efi_status_t status;
141 unsigned long nr_pages;
142 int i;
143
144 status = __get_map(&map, &map_size, &desc_size);
145 if (status != EFI_SUCCESS)
146 goto fail;
147
148 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
149 for (i = 0; i < map_size / desc_size; i++) {
150 efi_memory_desc_t *desc;
151 unsigned long m = (unsigned long)map;
152 u64 start, end;
153
154 desc = (efi_memory_desc_t *)(m + (i * desc_size));
155
156 if (desc->type != EFI_CONVENTIONAL_MEMORY)
157 continue;
158
159 if (desc->num_pages < nr_pages)
160 continue;
161
162 start = desc->phys_addr;
163 end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
164
165 /*
166 * Don't allocate at 0x0. It will confuse code that
167 * checks pointers against NULL. Skip the first 8
168 * bytes so we start at a nice even number.
169 */
170 if (start == 0x0)
171 start += 8;
172
173 start = round_up(start, align);
174 if ((start + size) > end)
175 continue;
176
177 status = efi_call_phys4(sys_table->boottime->allocate_pages,
178 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
179 nr_pages, &start);
180 if (status == EFI_SUCCESS) {
181 *addr = start;
182 break;
183 }
184 }
185
186 if (i == map_size / desc_size)
187 status = EFI_NOT_FOUND;
188
189free_pool:
190 efi_call_phys1(sys_table->boottime->free_pool, map);
191fail:
192 return status;
193}
194
195static void low_free(unsigned long size, unsigned long addr)
196{
197 unsigned long nr_pages;
198
199 nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
200 efi_call_phys2(sys_table->boottime->free_pages, addr, size);
201}
202
203static void find_bits(unsigned long mask, u8 *pos, u8 *size)
204{
205 u8 first, len;
206
207 first = 0;
208 len = 0;
209
210 if (mask) {
211 while (!(mask & 0x1)) {
212 mask = mask >> 1;
213 first++;
214 }
215
216 while (mask & 0x1) {
217 mask = mask >> 1;
218 len++;
219 }
220 }
221
222 *pos = first;
223 *size = len;
224}
225
226/*
227 * See if we have Graphics Output Protocol
228 */
229static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
230 unsigned long size)
231{
232 struct efi_graphics_output_protocol *gop, *first_gop;
233 struct efi_pixel_bitmask pixel_info;
234 unsigned long nr_gops;
235 efi_status_t status;
236 void **gop_handle;
237 u16 width, height;
238 u32 fb_base, fb_size;
239 u32 pixels_per_scan_line;
240 int pixel_format;
241 int i;
242
243 status = efi_call_phys3(sys_table->boottime->allocate_pool,
244 EFI_LOADER_DATA, size, &gop_handle);
245 if (status != EFI_SUCCESS)
246 return status;
247
248 status = efi_call_phys5(sys_table->boottime->locate_handle,
249 EFI_LOCATE_BY_PROTOCOL, proto,
250 NULL, &size, gop_handle);
251 if (status != EFI_SUCCESS)
252 goto free_handle;
253
254 first_gop = NULL;
255
256 nr_gops = size / sizeof(void *);
257 for (i = 0; i < nr_gops; i++) {
258 struct efi_graphics_output_mode_info *info;
259 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
260 void *pciio;
261 void *h = gop_handle[i];
262
263 status = efi_call_phys3(sys_table->boottime->handle_protocol,
264 h, proto, &gop);
265 if (status != EFI_SUCCESS)
266 continue;
267
268 efi_call_phys3(sys_table->boottime->handle_protocol,
269 h, &pciio_proto, &pciio);
270
271 status = efi_call_phys4(gop->query_mode, gop,
272 gop->mode->mode, &size, &info);
273 if (status == EFI_SUCCESS && (!first_gop || pciio)) {
274 /*
275 * Apple provide GOPs that are not backed by
276 * real hardware (they're used to handle
277 * multiple displays). The workaround is to
278 * search for a GOP implementing the PCIIO
279 * protocol, and if one isn't found, to just
280 * fallback to the first GOP.
281 */
282 width = info->horizontal_resolution;
283 height = info->vertical_resolution;
284 fb_base = gop->mode->frame_buffer_base;
285 fb_size = gop->mode->frame_buffer_size;
286 pixel_format = info->pixel_format;
287 pixel_info = info->pixel_information;
288 pixels_per_scan_line = info->pixels_per_scan_line;
289
290 /*
291 * Once we've found a GOP supporting PCIIO,
292 * don't bother looking any further.
293 */
294 if (pciio)
295 break;
296
297 first_gop = gop;
298 }
299 }
300
301 /* Did we find any GOPs? */
302 if (!first_gop)
303 goto free_handle;
304
305 /* EFI framebuffer */
306 si->orig_video_isVGA = VIDEO_TYPE_EFI;
307
308 si->lfb_width = width;
309 si->lfb_height = height;
310 si->lfb_base = fb_base;
311 si->lfb_size = fb_size;
312 si->pages = 1;
313
314 if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
315 si->lfb_depth = 32;
316 si->lfb_linelength = pixels_per_scan_line * 4;
317 si->red_size = 8;
318 si->red_pos = 0;
319 si->green_size = 8;
320 si->green_pos = 8;
321 si->blue_size = 8;
322 si->blue_pos = 16;
323 si->rsvd_size = 8;
324 si->rsvd_pos = 24;
325 } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
326 si->lfb_depth = 32;
327 si->lfb_linelength = pixels_per_scan_line * 4;
328 si->red_size = 8;
329 si->red_pos = 16;
330 si->green_size = 8;
331 si->green_pos = 8;
332 si->blue_size = 8;
333 si->blue_pos = 0;
334 si->rsvd_size = 8;
335 si->rsvd_pos = 24;
336 } else if (pixel_format == PIXEL_BIT_MASK) {
337 find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
338 find_bits(pixel_info.green_mask, &si->green_pos,
339 &si->green_size);
340 find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
341 find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
342 &si->rsvd_size);
343 si->lfb_depth = si->red_size + si->green_size +
344 si->blue_size + si->rsvd_size;
345 si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
346 } else {
347 si->lfb_depth = 4;
348 si->lfb_linelength = si->lfb_width / 2;
349 si->red_size = 0;
350 si->red_pos = 0;
351 si->green_size = 0;
352 si->green_pos = 0;
353 si->blue_size = 0;
354 si->blue_pos = 0;
355 si->rsvd_size = 0;
356 si->rsvd_pos = 0;
357 }
358
359free_handle:
360 efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
361 return status;
362}
363
364/*
365 * See if we have Universal Graphics Adapter (UGA) protocol
366 */
367static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
368 unsigned long size)
369{
370 struct efi_uga_draw_protocol *uga, *first_uga;
371 unsigned long nr_ugas;
372 efi_status_t status;
373 u32 width, height;
374 void **uga_handle = NULL;
375 int i;
376
377 status = efi_call_phys3(sys_table->boottime->allocate_pool,
378 EFI_LOADER_DATA, size, &uga_handle);
379 if (status != EFI_SUCCESS)
380 return status;
381
382 status = efi_call_phys5(sys_table->boottime->locate_handle,
383 EFI_LOCATE_BY_PROTOCOL, uga_proto,
384 NULL, &size, uga_handle);
385 if (status != EFI_SUCCESS)
386 goto free_handle;
387
388 first_uga = NULL;
389
390 nr_ugas = size / sizeof(void *);
391 for (i = 0; i < nr_ugas; i++) {
392 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
393 void *handle = uga_handle[i];
394 u32 w, h, depth, refresh;
395 void *pciio;
396
397 status = efi_call_phys3(sys_table->boottime->handle_protocol,
398 handle, uga_proto, &uga);
399 if (status != EFI_SUCCESS)
400 continue;
401
402 efi_call_phys3(sys_table->boottime->handle_protocol,
403 handle, &pciio_proto, &pciio);
404
405 status = efi_call_phys5(uga->get_mode, uga, &w, &h,
406 &depth, &refresh);
407 if (status == EFI_SUCCESS && (!first_uga || pciio)) {
408 width = w;
409 height = h;
410
411 /*
412 * Once we've found a UGA supporting PCIIO,
413 * don't bother looking any further.
414 */
415 if (pciio)
416 break;
417
418 first_uga = uga;
419 }
420 }
421
422 if (!first_uga)
423 goto free_handle;
424
425 /* EFI framebuffer */
426 si->orig_video_isVGA = VIDEO_TYPE_EFI;
427
428 si->lfb_depth = 32;
429 si->lfb_width = width;
430 si->lfb_height = height;
431
432 si->red_size = 8;
433 si->red_pos = 16;
434 si->green_size = 8;
435 si->green_pos = 8;
436 si->blue_size = 8;
437 si->blue_pos = 0;
438 si->rsvd_size = 8;
439 si->rsvd_pos = 24;
440
441
442free_handle:
443 efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
444 return status;
445}
446
447void setup_graphics(struct boot_params *boot_params)
448{
449 efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
450 struct screen_info *si;
451 efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
452 efi_status_t status;
453 unsigned long size;
454 void **gop_handle = NULL;
455 void **uga_handle = NULL;
456
457 si = &boot_params->screen_info;
458 memset(si, 0, sizeof(*si));
459
460 size = 0;
461 status = efi_call_phys5(sys_table->boottime->locate_handle,
462 EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
463 NULL, &size, gop_handle);
464 if (status == EFI_BUFFER_TOO_SMALL)
465 status = setup_gop(si, &graphics_proto, size);
466
467 if (status != EFI_SUCCESS) {
468 size = 0;
469 status = efi_call_phys5(sys_table->boottime->locate_handle,
470 EFI_LOCATE_BY_PROTOCOL, &uga_proto,
471 NULL, &size, uga_handle);
472 if (status == EFI_BUFFER_TOO_SMALL)
473 setup_uga(si, &uga_proto, size);
474 }
475}
476
477struct initrd {
478 efi_file_handle_t *handle;
479 u64 size;
480};
481
482/*
483 * Check the cmdline for a LILO-style initrd= arguments.
484 *
485 * We only support loading an initrd from the same filesystem as the
486 * kernel image.
487 */
488static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
489 struct setup_header *hdr)
490{
491 struct initrd *initrds;
492 unsigned long initrd_addr;
493 efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
494 u64 initrd_total;
495 efi_file_io_interface_t *io;
496 efi_file_handle_t *fh;
497 efi_status_t status;
498 int nr_initrds;
499 char *str;
500 int i, j, k;
501
502 initrd_addr = 0;
503 initrd_total = 0;
504
505 str = (char *)(unsigned long)hdr->cmd_line_ptr;
506
507 j = 0; /* See close_handles */
508
509 if (!str || !*str)
510 return EFI_SUCCESS;
511
512 for (nr_initrds = 0; *str; nr_initrds++) {
513 str = strstr(str, "initrd=");
514 if (!str)
515 break;
516
517 str += 7;
518
519 /* Skip any leading slashes */
520 while (*str == '/' || *str == '\\')
521 str++;
522
523 while (*str && *str != ' ' && *str != '\n')
524 str++;
525 }
526
527 if (!nr_initrds)
528 return EFI_SUCCESS;
529
530 status = efi_call_phys3(sys_table->boottime->allocate_pool,
531 EFI_LOADER_DATA,
532 nr_initrds * sizeof(*initrds),
533 &initrds);
534 if (status != EFI_SUCCESS)
535 goto fail;
536
537 str = (char *)(unsigned long)hdr->cmd_line_ptr;
538 for (i = 0; i < nr_initrds; i++) {
539 struct initrd *initrd;
540 efi_file_handle_t *h;
541 efi_file_info_t *info;
542 efi_char16_t filename[256];
543 unsigned long info_sz;
544 efi_guid_t info_guid = EFI_FILE_INFO_ID;
545 efi_char16_t *p;
546 u64 file_sz;
547
548 str = strstr(str, "initrd=");
549 if (!str)
550 break;
551
552 str += 7;
553
554 initrd = &initrds[i];
555 p = filename;
556
557 /* Skip any leading slashes */
558 while (*str == '/' || *str == '\\')
559 str++;
560
561 while (*str && *str != ' ' && *str != '\n') {
562 if (p >= filename + sizeof(filename))
563 break;
564
565 *p++ = *str++;
566 }
567
568 *p = '\0';
569
570 /* Only open the volume once. */
571 if (!i) {
572 efi_boot_services_t *boottime;
573
574 boottime = sys_table->boottime;
575
576 status = efi_call_phys3(boottime->handle_protocol,
577 image->device_handle, &fs_proto, &io);
578 if (status != EFI_SUCCESS)
579 goto free_initrds;
580
581 status = efi_call_phys2(io->open_volume, io, &fh);
582 if (status != EFI_SUCCESS)
583 goto free_initrds;
584 }
585
586 status = efi_call_phys5(fh->open, fh, &h, filename,
587 EFI_FILE_MODE_READ, (u64)0);
588 if (status != EFI_SUCCESS)
589 goto close_handles;
590
591 initrd->handle = h;
592
593 info_sz = 0;
594 status = efi_call_phys4(h->get_info, h, &info_guid,
595 &info_sz, NULL);
596 if (status != EFI_BUFFER_TOO_SMALL)
597 goto close_handles;
598
599grow:
600 status = efi_call_phys3(sys_table->boottime->allocate_pool,
601 EFI_LOADER_DATA, info_sz, &info);
602 if (status != EFI_SUCCESS)
603 goto close_handles;
604
605 status = efi_call_phys4(h->get_info, h, &info_guid,
606 &info_sz, info);
607 if (status == EFI_BUFFER_TOO_SMALL) {
608 efi_call_phys1(sys_table->boottime->free_pool, info);
609 goto grow;
610 }
611
612 file_sz = info->file_size;
613 efi_call_phys1(sys_table->boottime->free_pool, info);
614
615 if (status != EFI_SUCCESS)
616 goto close_handles;
617
618 initrd->size = file_sz;
619 initrd_total += file_sz;
620 }
621
622 if (initrd_total) {
623 unsigned long addr;
624
625 /*
626 * Multiple initrd's need to be at consecutive
627 * addresses in memory, so allocate enough memory for
628 * all the initrd's.
629 */
630 status = high_alloc(initrd_total, 0x1000,
631 &initrd_addr, hdr->initrd_addr_max);
632 if (status != EFI_SUCCESS)
633 goto close_handles;
634
635 /* We've run out of free low memory. */
636 if (initrd_addr > hdr->initrd_addr_max) {
637 status = EFI_INVALID_PARAMETER;
638 goto free_initrd_total;
639 }
640
641 addr = initrd_addr;
642 for (j = 0; j < nr_initrds; j++) {
643 u64 size;
644
645 size = initrds[j].size;
646 while (size) {
647 u64 chunksize;
648 if (size > EFI_READ_CHUNK_SIZE)
649 chunksize = EFI_READ_CHUNK_SIZE;
650 else
651 chunksize = size;
652 status = efi_call_phys3(fh->read,
653 initrds[j].handle,
654 &chunksize, addr);
655 if (status != EFI_SUCCESS)
656 goto free_initrd_total;
657 addr += chunksize;
658 size -= chunksize;
659 }
660
661 efi_call_phys1(fh->close, initrds[j].handle);
662 }
663
664 }
665
666 efi_call_phys1(sys_table->boottime->free_pool, initrds);
667
668 hdr->ramdisk_image = initrd_addr;
669 hdr->ramdisk_size = initrd_total;
670
671 return status;
672
673free_initrd_total:
674 low_free(initrd_total, initrd_addr);
675
676close_handles:
677 for (k = j; k < nr_initrds; k++)
678 efi_call_phys1(fh->close, initrds[k].handle);
679free_initrds:
680 efi_call_phys1(sys_table->boottime->free_pool, initrds);
681fail:
682 hdr->ramdisk_image = 0;
683 hdr->ramdisk_size = 0;
684
685 return status;
686}
687
688/*
689 * Because the x86 boot code expects to be passed a boot_params we
690 * need to create one ourselves (usually the bootloader would create
691 * one for us).
692 */
693static efi_status_t make_boot_params(struct boot_params *boot_params,
694 efi_loaded_image_t *image,
695 void *handle)
696{
697 struct efi_info *efi = &boot_params->efi_info;
698 struct apm_bios_info *bi = &boot_params->apm_bios_info;
699 struct sys_desc_table *sdt = &boot_params->sys_desc_table;
700 struct e820entry *e820_map = &boot_params->e820_map[0];
701 struct e820entry *prev = NULL;
702 struct setup_header *hdr = &boot_params->hdr;
703 unsigned long size, key, desc_size, _size;
704 efi_memory_desc_t *mem_map;
705 void *options = image->load_options;
706 u32 load_options_size = image->load_options_size / 2; /* ASCII */
707 int options_size = 0;
708 efi_status_t status;
709 __u32 desc_version;
710 unsigned long cmdline;
711 u8 nr_entries;
712 u16 *s2;
713 u8 *s1;
714 int i;
715
716 hdr->type_of_loader = 0x21;
717
718 /* Convert unicode cmdline to ascii */
719 cmdline = 0;
720 s2 = (u16 *)options;
721
722 if (s2) {
723 while (*s2 && *s2 != '\n' && options_size < load_options_size) {
724 s2++;
725 options_size++;
726 }
727
728 if (options_size) {
729 if (options_size > hdr->cmdline_size)
730 options_size = hdr->cmdline_size;
731
732 options_size++; /* NUL termination */
733
734 status = low_alloc(options_size, 1, &cmdline);
735 if (status != EFI_SUCCESS)
736 goto fail;
737
738 s1 = (u8 *)(unsigned long)cmdline;
739 s2 = (u16 *)options;
740
741 for (i = 0; i < options_size - 1; i++)
742 *s1++ = *s2++;
743
744 *s1 = '\0';
745 }
746 }
747
748 hdr->cmd_line_ptr = cmdline;
749
750 hdr->ramdisk_image = 0;
751 hdr->ramdisk_size = 0;
752
753 status = handle_ramdisks(image, hdr);
754 if (status != EFI_SUCCESS)
755 goto free_cmdline;
756
757 setup_graphics(boot_params);
758
759 /* Clear APM BIOS info */
760 memset(bi, 0, sizeof(*bi));
761
762 memset(sdt, 0, sizeof(*sdt));
763
764 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
765
766 size = sizeof(*mem_map) * 32;
767
768again:
769 size += sizeof(*mem_map);
770 _size = size;
771 status = low_alloc(size, 1, (unsigned long *)&mem_map);
772 if (status != EFI_SUCCESS)
773 goto free_cmdline;
774
775 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
776 mem_map, &key, &desc_size, &desc_version);
777 if (status == EFI_BUFFER_TOO_SMALL) {
778 low_free(_size, (unsigned long)mem_map);
779 goto again;
780 }
781
782 if (status != EFI_SUCCESS)
783 goto free_mem_map;
784
785 efi->efi_systab = (unsigned long)sys_table;
786 efi->efi_memdesc_size = desc_size;
787 efi->efi_memdesc_version = desc_version;
788 efi->efi_memmap = (unsigned long)mem_map;
789 efi->efi_memmap_size = size;
790
791#ifdef CONFIG_X86_64
792 efi->efi_systab_hi = (unsigned long)sys_table >> 32;
793 efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
794#endif
795
796 /* Might as well exit boot services now */
797 status = efi_call_phys2(sys_table->boottime->exit_boot_services,
798 handle, key);
799 if (status != EFI_SUCCESS)
800 goto free_mem_map;
801
802 /* Historic? */
803 boot_params->alt_mem_k = 32 * 1024;
804
805 /*
806 * Convert the EFI memory map to E820.
807 */
808 nr_entries = 0;
809 for (i = 0; i < size / desc_size; i++) {
810 efi_memory_desc_t *d;
811 unsigned int e820_type = 0;
812 unsigned long m = (unsigned long)mem_map;
813
814 d = (efi_memory_desc_t *)(m + (i * desc_size));
815 switch (d->type) {
816 case EFI_RESERVED_TYPE:
817 case EFI_RUNTIME_SERVICES_CODE:
818 case EFI_RUNTIME_SERVICES_DATA:
819 case EFI_MEMORY_MAPPED_IO:
820 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
821 case EFI_PAL_CODE:
822 e820_type = E820_RESERVED;
823 break;
824
825 case EFI_UNUSABLE_MEMORY:
826 e820_type = E820_UNUSABLE;
827 break;
828
829 case EFI_ACPI_RECLAIM_MEMORY:
830 e820_type = E820_ACPI;
831 break;
832
833 case EFI_LOADER_CODE:
834 case EFI_LOADER_DATA:
835 case EFI_BOOT_SERVICES_CODE:
836 case EFI_BOOT_SERVICES_DATA:
837 case EFI_CONVENTIONAL_MEMORY:
838 e820_type = E820_RAM;
839 break;
840
841 case EFI_ACPI_MEMORY_NVS:
842 e820_type = E820_NVS;
843 break;
844
845 default:
846 continue;
847 }
848
849 /* Merge adjacent mappings */
850 if (prev && prev->type == e820_type &&
851 (prev->addr + prev->size) == d->phys_addr)
852 prev->size += d->num_pages << 12;
853 else {
854 e820_map->addr = d->phys_addr;
855 e820_map->size = d->num_pages << 12;
856 e820_map->type = e820_type;
857 prev = e820_map++;
858 nr_entries++;
859 }
860 }
861
862 boot_params->e820_entries = nr_entries;
863
864 return EFI_SUCCESS;
865
866free_mem_map:
867 low_free(_size, (unsigned long)mem_map);
868free_cmdline:
869 if (options_size)
870 low_free(options_size, hdr->cmd_line_ptr);
871fail:
872 return status;
873}
874
875/*
876 * On success we return a pointer to a boot_params structure, and NULL
877 * on failure.
878 */
879struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
880{
881 struct boot_params *boot_params;
882 unsigned long start, nr_pages;
883 struct desc_ptr *gdt, *idt;
884 efi_loaded_image_t *image;
885 struct setup_header *hdr;
886 efi_status_t status;
887 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
888 struct desc_struct *desc;
889
890 sys_table = _table;
891
892 /* Check if we were booted by the EFI firmware */
893 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
894 goto fail;
895
896 status = efi_call_phys3(sys_table->boottime->handle_protocol,
897 handle, &proto, (void *)&image);
898 if (status != EFI_SUCCESS)
899 goto fail;
900
901 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
902 if (status != EFI_SUCCESS)
903 goto fail;
904
905 memset(boot_params, 0x0, 0x4000);
906
907 /* Copy first two sectors to boot_params */
908 memcpy(boot_params, image->image_base, 1024);
909
910 hdr = &boot_params->hdr;
911
912 /*
913 * The EFI firmware loader could have placed the kernel image
914 * anywhere in memory, but the kernel has various restrictions
915 * on the max physical address it can run at. Attempt to move
916 * the kernel to boot_params.pref_address, or as low as
917 * possible.
918 */
919 start = hdr->pref_address;
920 nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
921
922 status = efi_call_phys4(sys_table->boottime->allocate_pages,
923 EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
924 nr_pages, &start);
925 if (status != EFI_SUCCESS) {
926 status = low_alloc(hdr->init_size, hdr->kernel_alignment,
927 &start);
928 if (status != EFI_SUCCESS)
929 goto fail;
930 }
931
932 hdr->code32_start = (__u32)start;
933 hdr->pref_address = (__u64)(unsigned long)image->image_base;
934
935 memcpy((void *)start, image->image_base, image->image_size);
936
937 status = efi_call_phys3(sys_table->boottime->allocate_pool,
938 EFI_LOADER_DATA, sizeof(*gdt),
939 (void **)&gdt);
940 if (status != EFI_SUCCESS)
941 goto fail;
942
943 gdt->size = 0x800;
944 status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
945 if (status != EFI_SUCCESS)
946 goto fail;
947
948 status = efi_call_phys3(sys_table->boottime->allocate_pool,
949 EFI_LOADER_DATA, sizeof(*idt),
950 (void **)&idt);
951 if (status != EFI_SUCCESS)
952 goto fail;
953
954 idt->size = 0;
955 idt->address = 0;
956
957 status = make_boot_params(boot_params, image, handle);
958 if (status != EFI_SUCCESS)
959 goto fail;
960
961 memset((char *)gdt->address, 0x0, gdt->size);
962 desc = (struct desc_struct *)gdt->address;
963
964 /* The first GDT is a dummy and the second is unused. */
965 desc += 2;
966
967 desc->limit0 = 0xffff;
968 desc->base0 = 0x0000;
969 desc->base1 = 0x0000;
970 desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
971 desc->s = DESC_TYPE_CODE_DATA;
972 desc->dpl = 0;
973 desc->p = 1;
974 desc->limit = 0xf;
975 desc->avl = 0;
976 desc->l = 0;
977 desc->d = SEG_OP_SIZE_32BIT;
978 desc->g = SEG_GRANULARITY_4KB;
979 desc->base2 = 0x00;
980
981 desc++;
982 desc->limit0 = 0xffff;
983 desc->base0 = 0x0000;
984 desc->base1 = 0x0000;
985 desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
986 desc->s = DESC_TYPE_CODE_DATA;
987 desc->dpl = 0;
988 desc->p = 1;
989 desc->limit = 0xf;
990 desc->avl = 0;
991 desc->l = 0;
992 desc->d = SEG_OP_SIZE_32BIT;
993 desc->g = SEG_GRANULARITY_4KB;
994 desc->base2 = 0x00;
995
996#ifdef CONFIG_X86_64
997 /* Task segment value */
998 desc++;
999 desc->limit0 = 0x0000;
1000 desc->base0 = 0x0000;
1001 desc->base1 = 0x0000;
1002 desc->type = SEG_TYPE_TSS;
1003 desc->s = 0;
1004 desc->dpl = 0;
1005 desc->p = 1;
1006 desc->limit = 0x0;
1007 desc->avl = 0;
1008 desc->l = 0;
1009 desc->d = 0;
1010 desc->g = SEG_GRANULARITY_4KB;
1011 desc->base2 = 0x00;
1012#endif /* CONFIG_X86_64 */
1013
1014 asm volatile ("lidt %0" : : "m" (*idt));
1015 asm volatile ("lgdt %0" : : "m" (*gdt));
1016
1017 asm volatile("cli");
1018
1019 return boot_params;
1020fail:
1021 return NULL;
1022}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000000..39251663e65b
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,61 @@
1#ifndef BOOT_COMPRESSED_EBOOT_H
2#define BOOT_COMPRESSED_EBOOT_H
3
4#define SEG_TYPE_DATA (0 << 3)
5#define SEG_TYPE_READ_WRITE (1 << 1)
6#define SEG_TYPE_CODE (1 << 3)
7#define SEG_TYPE_EXEC_READ (1 << 1)
8#define SEG_TYPE_TSS ((1 << 3) | (1 << 0))
9#define SEG_OP_SIZE_32BIT (1 << 0)
10#define SEG_GRANULARITY_4KB (1 << 0)
11
12#define DESC_TYPE_CODE_DATA (1 << 0)
13
14#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT)
15#define EFI_READ_CHUNK_SIZE (1024 * 1024)
16
17#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
18#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
19#define PIXEL_BIT_MASK 2
20#define PIXEL_BLT_ONLY 3
21#define PIXEL_FORMAT_MAX 4
22
23struct efi_pixel_bitmask {
24 u32 red_mask;
25 u32 green_mask;
26 u32 blue_mask;
27 u32 reserved_mask;
28};
29
30struct efi_graphics_output_mode_info {
31 u32 version;
32 u32 horizontal_resolution;
33 u32 vertical_resolution;
34 int pixel_format;
35 struct efi_pixel_bitmask pixel_information;
36 u32 pixels_per_scan_line;
37} __packed;
38
39struct efi_graphics_output_protocol_mode {
40 u32 max_mode;
41 u32 mode;
42 unsigned long info;
43 unsigned long size_of_info;
44 u64 frame_buffer_base;
45 unsigned long frame_buffer_size;
46} __packed;
47
48struct efi_graphics_output_protocol {
49 void *query_mode;
50 unsigned long set_mode;
51 unsigned long blt;
52 struct efi_graphics_output_protocol_mode *mode;
53};
54
55struct efi_uga_draw_protocol {
56 void *get_mode;
57 void *set_mode;
58 void *blt;
59};
60
61#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000000..a53440e81d52
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off. Note that this implementation is different from the one in
6 * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
7 * mode at this point.
8 */
9
10#include <linux/linkage.h>
11#include <asm/page_types.h>
12
13/*
14 * efi_call_phys(void *, ...) is a function with variable parameters.
15 * All the callers of this function assure that all the parameters are 4-bytes.
16 */
17
18/*
19 * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
20 * So we'd better save all of them at the beginning of this function and restore
21 * at the end no matter how many we use, because we can not assure EFI runtime
22 * service functions will comply with gcc calling convention, too.
23 */
24
25.text
26ENTRY(efi_call_phys)
27 /*
28 * 0. The function can only be called in Linux kernel. So CS has been
29 * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
30 * the values of these registers are the same. And, the corresponding
31 * GDT entries are identical. So I will do nothing about segment reg
32 * and GDT, but change GDT base register in prelog and epilog.
33 */
34
35 /*
36 * 1. Because we haven't been relocated by this point we need to
37 * use relative addressing.
38 */
39 call 1f
401: popl %edx
41 subl $1b, %edx
42
43 /*
44 * 2. Now on the top of stack is the return
45 * address in the caller of efi_call_phys(), then parameter 1,
46 * parameter 2, ..., param n. To make things easy, we save the return
47 * address of efi_call_phys in a global variable.
48 */
49 popl %ecx
50 movl %ecx, saved_return_addr(%edx)
51 /* get the function pointer into ECX*/
52 popl %ecx
53 movl %ecx, efi_rt_function_ptr(%edx)
54
55 /*
56 * 3. Call the physical function.
57 */
58 call *%ecx
59
60 /*
61 * 4. Balance the stack. And because EAX contain the return value,
62 * we'd better not clobber it. We need to calculate our address
63 * again because %ecx and %edx are not preserved across EFI function
64 * calls.
65 */
66 call 1f
671: popl %edx
68 subl $1b, %edx
69
70 movl efi_rt_function_ptr(%edx), %ecx
71 pushl %ecx
72
73 /*
74 * 10. Push the saved return address onto the stack and return.
75 */
76 movl saved_return_addr(%edx), %ecx
77 pushl %ecx
78 ret
79ENDPROC(efi_call_phys)
80.previous
81
82.data
83saved_return_addr:
84 .long 0
85efi_rt_function_ptr:
86 .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000000..cedc60de86eb
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a39ce4..a0559930a180 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
32 32
33 __HEAD 33 __HEAD
34ENTRY(startup_32) 34ENTRY(startup_32)
35#ifdef CONFIG_EFI_STUB
36 /*
37 * We don't need the return address, so set up the stack so
38 * efi_main() can find its arugments.
39 */
40 add $0x4, %esp
41
42 call efi_main
43 cmpl $0, %eax
44 je preferred_addr
45 movl %eax, %esi
46 call 1f
471:
48 popl %eax
49 subl $1b, %eax
50 subl BP_pref_address(%esi), %eax
51 add BP_code32_start(%esi), %eax
52 leal preferred_addr(%eax), %eax
53 jmp *%eax
54
55preferred_addr:
56#endif
35 cld 57 cld
36 /* 58 /*
37 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 59 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d13dc1..558d76ce23bc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
199 * an identity mapped page table being provied that maps our 199 * an identity mapped page table being provied that maps our
200 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
201 */ 201 */
202#ifdef CONFIG_EFI_STUB
203 pushq %rsi
204 mov %rcx, %rdi
205 mov %rdx, %rsi
206 call efi_main
207 popq %rsi
208 cmpq $0,%rax
209 je preferred_addr
210 movq %rax,%rsi
211 call 1f
2121:
213 popq %rax
214 subq $1b, %rax
215 subq BP_pref_address(%rsi), %rax
216 add BP_code32_start(%esi), %eax
217 leaq preferred_addr(%rax), %rax
218 jmp *%rax
219
220preferred_addr:
221#endif
202 222
203 /* Setup data segments. */ 223 /* Setup data segments. */
204 xorl %eax, %eax 224 xorl %eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e693cd72..ffb9c5c9d748 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
1#include "misc.h" 1#include "misc.h"
2
3int memcmp(const void *s1, const void *s2, size_t len)
4{
5 u8 diff;
6 asm("repe; cmpsb; setnz %0"
7 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
8 return diff;
9}
10
2#include "../string.c" 11#include "../string.c"
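For reference, a plain-C equivalent of the inline-asm memcmp() added above (a sketch, not kernel code): repe; cmpsb compares byte by byte and setnz records only whether a mismatch occurred, so unlike the standard memcmp() the result carries no sign, just zero for equal and non-zero otherwise.

#include <stddef.h>

static int memcmp_ref(const void *s1, const void *s2, size_t len)
{
	const unsigned char *a = s1, *b = s2;
	size_t i;

	for (i = 0; i < len; i++)
		if (a[i] != b[i])
			return 1;	/* mismatch: any non-zero value, like setnz */
	return 0;			/* all bytes equal */
}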
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d458ec8c..f1bbeeb09148 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
45 45
46 .global bootsect_start 46 .global bootsect_start
47bootsect_start: 47bootsect_start:
48#ifdef CONFIG_EFI_STUB
49 # "MZ", MS-DOS header
50 .byte 0x4d
51 .byte 0x5a
52#endif
48 53
49 # Normalize the start address 54 # Normalize the start address
50 ljmp $BOOTSEG, $start2 55 ljmp $BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
79 # invoke the BIOS reset code... 84 # invoke the BIOS reset code...
80 ljmp $0xf000,$0xfff0 85 ljmp $0xf000,$0xfff0
81 86
87#ifdef CONFIG_EFI_STUB
88 .org 0x3c
89 #
90 # Offset to the PE header.
91 #
92 .long pe_header
93#endif /* CONFIG_EFI_STUB */
94
82 .section ".bsdata", "a" 95 .section ".bsdata", "a"
83bugger_off_msg: 96bugger_off_msg:
84 .ascii "Direct booting from floppy is no longer supported.\r\n" 97 .ascii "Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
87 .ascii "Remove disk and press any key to reboot . . .\r\n" 100 .ascii "Remove disk and press any key to reboot . . .\r\n"
88 .byte 0 101 .byte 0
89 102
103#ifdef CONFIG_EFI_STUB
104pe_header:
105 .ascii "PE"
106 .word 0
107
108coff_header:
109#ifdef CONFIG_X86_32
110 .word 0x14c # i386
111#else
112 .word 0x8664 # x86-64
113#endif
114 .word 2 # nr_sections
115 .long 0 # TimeDateStamp
116 .long 0 # PointerToSymbolTable
117 .long 1 # NumberOfSymbols
118 .word section_table - optional_header # SizeOfOptionalHeader
119#ifdef CONFIG_X86_32
120 .word 0x306 # Characteristics.
121 # IMAGE_FILE_32BIT_MACHINE |
122 # IMAGE_FILE_DEBUG_STRIPPED |
123 # IMAGE_FILE_EXECUTABLE_IMAGE |
124 # IMAGE_FILE_LINE_NUMS_STRIPPED
125#else
126 .word 0x206 # Characteristics
127 # IMAGE_FILE_DEBUG_STRIPPED |
128 # IMAGE_FILE_EXECUTABLE_IMAGE |
129 # IMAGE_FILE_LINE_NUMS_STRIPPED
130#endif
131
132optional_header:
133#ifdef CONFIG_X86_32
134 .word 0x10b # PE32 format
135#else
136 .word 0x20b # PE32+ format
137#endif
138 .byte 0x02 # MajorLinkerVersion
139 .byte 0x14 # MinorLinkerVersion
140
141 # Filled in by build.c
142 .long 0 # SizeOfCode
143
144 .long 0 # SizeOfInitializedData
145 .long 0 # SizeOfUninitializedData
146
147 # Filled in by build.c
148 .long 0x0000 # AddressOfEntryPoint
149
150 .long 0x0000 # BaseOfCode
151#ifdef CONFIG_X86_32
152 .long 0 # data
153#endif
154
155extra_header_fields:
156#ifdef CONFIG_X86_32
157 .long 0 # ImageBase
158#else
159 .quad 0 # ImageBase
160#endif
161 .long 0x1000 # SectionAlignment
162 .long 0x200 # FileAlignment
163 .word 0 # MajorOperatingSystemVersion
164 .word 0 # MinorOperatingSystemVersion
165 .word 0 # MajorImageVersion
166 .word 0 # MinorImageVersion
167 .word 0 # MajorSubsystemVersion
168 .word 0 # MinorSubsystemVersion
169 .long 0 # Win32VersionValue
170
171 #
172 # The size of the bzImage is written in tools/build.c
173 #
174 .long 0 # SizeOfImage
175
176 .long 0x200 # SizeOfHeaders
177 .long 0 # CheckSum
178 .word 0xa # Subsystem (EFI application)
179 .word 0 # DllCharacteristics
180#ifdef CONFIG_X86_32
181 .long 0 # SizeOfStackReserve
182 .long 0 # SizeOfStackCommit
183 .long 0 # SizeOfHeapReserve
184 .long 0 # SizeOfHeapCommit
185#else
186 .quad 0 # SizeOfStackReserve
187 .quad 0 # SizeOfStackCommit
188 .quad 0 # SizeOfHeapReserve
189 .quad 0 # SizeOfHeapCommit
190#endif
191 .long 0 # LoaderFlags
192 .long 0x1 # NumberOfRvaAndSizes
193
194 .quad 0 # ExportTable
195 .quad 0 # ImportTable
196 .quad 0 # ResourceTable
197 .quad 0 # ExceptionTable
198 .quad 0 # CertificateTable
199 .quad 0 # BaseRelocationTable
200
201 # Section table
202section_table:
203 .ascii ".text"
204 .byte 0
205 .byte 0
206 .byte 0
207 .long 0
208 .long 0x0 # startup_{32,64}
209 .long 0 # Size of initialized data
210 # on disk
211 .long 0x0 # startup_{32,64}
212 .long 0 # PointerToRelocations
213 .long 0 # PointerToLineNumbers
214 .word 0 # NumberOfRelocations
215 .word 0 # NumberOfLineNumbers
216 .long 0x60500020 # Characteristics (section flags)
217
218 #
219 # The EFI application loader requires a relocation section
220 # because EFI applications are relocatable and not having
221 # this section seems to confuse it. But since we don't need
222 # the loader to fix up any relocs for us, just fill it with a
223 # single dummy reloc.
224 #
225 .ascii ".reloc"
226 .byte 0
227 .byte 0
228 .long reloc_end - reloc_start
229 .long reloc_start
230 .long reloc_end - reloc_start # SizeOfRawData
231 .long reloc_start # PointerToRawData
232 .long 0 # PointerToRelocations
233 .long 0 # PointerToLineNumbers
234 .word 0 # NumberOfRelocations
235 .word 0 # NumberOfLineNumbers
236 .long 0x42100040 # Characteristics (section flags)
237#endif /* CONFIG_EFI_STUB */
90 238
91 # Kernel attributes; used by setup. This is part 1 of the 239 # Kernel attributes; used by setup. This is part 1 of the
92 # header, from the old boot sector. 240 # header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
318setup_corrupt: 466setup_corrupt:
319 .byte 7 467 .byte 7
320 .string "No setup signature found...\n" 468 .string "No setup signature found...\n"
469
470 .data
471dummy: .long 0
472
473 .section .reloc
474reloc_start:
475 .long dummy - reloc_start
476 .long 10
477 .word 0
478reloc_end:
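The dummy relocation block above follows the PE base-relocation format: an 8-byte block header (page RVA plus block size) followed by 16-bit entries, and a single entry of type 0 (IMAGE_REL_BASED_ABSOLUTE) is defined as a no-op, so a 10-byte block keeps EFI loaders happy without asking them to patch anything. An illustrative C view of the layout (a sketch, not kernel code):

#include <stdint.h>

struct pe_base_reloc_block {
	uint32_t page_rva;	/* .long dummy - reloc_start                  */
	uint32_t block_size;	/* .long 10: 8-byte header + one 2-byte entry */
	uint16_t entry[1];	/* .word 0: type 0 (absolute) is ignored      */
};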
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc4058dd26..574dedfe2890 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
111 111
112 return result; 112 return result;
113} 113}
114
115/**
116 * strlen - Find the length of a string
117 * @s: The string to be sized
118 */
119size_t strlen(const char *s)
120{
121 const char *sc;
122
123 for (sc = s; *sc != '\0'; ++sc)
124 /* nothing */;
125 return sc - s;
126}
127
128/**
129 * strstr - Find the first substring in a %NUL terminated string
130 * @s1: The string to be searched
131 * @s2: The string to search for
132 */
133char *strstr(const char *s1, const char *s2)
134{
135 size_t l1, l2;
136
137 l2 = strlen(s2);
138 if (!l2)
139 return (char *)s1;
140 l1 = strlen(s1);
141 while (l1 >= l2) {
142 l1--;
143 if (!memcmp(s1, s2, l2))
144 return (char *)s1;
145 s1++;
146 }
147 return NULL;
148}
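A minimal host-side usage sketch for the strstr()/strlen() helpers added above (illustrative only; it just exercises the standard semantics the boot copies implement):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *cmdline = "console=ttyS0,115200 earlyprintk=serial";

	/* strstr() returns a pointer to the first match or NULL */
	printf("earlyprintk option %s\n",
	       strstr(cmdline, "earlyprintk=") ? "present" : "absent");
	printf("cmdline length: %zu\n", strlen(cmdline));
	return 0;
}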
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0b3c20..4e9bd6bcafa6 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
137{ 137{
138#ifdef CONFIG_EFI_STUB
139 unsigned int file_sz, pe_header;
140#endif
138 unsigned int i, sz, setup_sectors; 141 unsigned int i, sz, setup_sectors;
139 int c; 142 int c;
140 u32 sys_size; 143 u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
194 buf[0x1f6] = sys_size >> 16; 197 buf[0x1f6] = sys_size >> 16;
195 buf[0x1f7] = sys_size >> 24; 198 buf[0x1f7] = sys_size >> 24;
196 199
200#ifdef CONFIG_EFI_STUB
201 file_sz = sz + i + ((sys_size * 16) - sz);
202
203 pe_header = *(unsigned int *)&buf[0x3c];
204
205 /* Size of code */
206 *(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
207
208 /* Size of image */
209 *(unsigned int *)&buf[pe_header + 0x50] = file_sz;
210
211#ifdef CONFIG_X86_32
212 /* Address of entry point */
213 *(unsigned int *)&buf[pe_header + 0x28] = i;
214
215 /* .text size */
216 *(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
217
218 /* .text size of initialised data */
219 *(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
220#else
221 /*
222 * Address of entry point. startup_32 is at the beginning and
223 * the 64-bit entry point (startup_64) is always 512 bytes
224 * after.
225 */
226 *(unsigned int *)&buf[pe_header + 0x28] = i + 512;
227
228 /* .text size */
229 *(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
230
231 /* .text size of initialised data */
232 *(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
233#endif /* CONFIG_X86_32 */
234#endif /* CONFIG_EFI_STUB */
235
197 crc = partial_crc32(buf, i, crc); 236 crc = partial_crc32(buf, i, crc);
198 if (fwrite(buf, 1, i, stdout) != i) 237 if (fwrite(buf, 1, i, stdout) != i)
199 die("Writing setup failed"); 238 die("Writing setup failed");
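The magic offsets patched above fall straight out of the PE layout defined in header.S: the MZ header stores the offset of the "PE\0\0" signature at 0x3c, the 20-byte COFF header follows the 4-byte signature, so optional-header fields live at pe_header + 0x18 + field offset; the section table follows the optional header, whose size differs between PE32 and PE32+, which is why the .text fields sit at 0xb0/0xb8 on 32-bit but 0xc0/0xc8 on 64-bit. A small self-checking sketch of that arithmetic (not kernel code; the 0x90/0xa0 optional-header sizes are as laid out in header.S above):

#include <assert.h>

#define PE_SIG_SIZE		4
#define COFF_HEADER_SIZE	20
#define OPT_HDR(off)		(PE_SIG_SIZE + COFF_HEADER_SIZE + (off))
#define SECTION_TABLE_32	(0x18 + 0x90)	/* PE32 optional header size  */
#define SECTION_TABLE_64	(0x18 + 0xa0)	/* PE32+ optional header size */

int main(void)
{
	assert(OPT_HDR(0x04) == 0x1c);			/* SizeOfCode          */
	assert(OPT_HDR(0x10) == 0x28);			/* AddressOfEntryPoint */
	assert(OPT_HDR(0x38) == 0x50);			/* SizeOfImage         */
	assert(SECTION_TABLE_32 + 0x08 == 0xb0);	/* .text VirtualSize   */
	assert(SECTION_TABLE_32 + 0x10 == 0xb8);	/* .text SizeOfRawData */
	assert(SECTION_TABLE_64 + 0x08 == 0xc0);
	assert(SECTION_TABLE_64 + 0x10 == 0xc8);
	return 0;
}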
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..2b0b9631474b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,12 +5,14 @@
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
8obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
8 9
9obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o 10obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 11obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
11obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 13obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 14obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
15obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 16obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 17obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
16 18
@@ -20,12 +22,14 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
20aes-i586-y := aes-i586-asm_32.o aes_glue.o 22aes-i586-y := aes-i586-asm_32.o aes_glue.o
21twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 23twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
22salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 24salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
25serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
23 26
24aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o 27aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
25blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o 28blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
26twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 29twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
27twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o 30twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
28salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 31salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
32serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
29 33
30aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 34aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
31 35
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..4e37677ca851
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
1/*
2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-i586-asm_32.S"
28.text
29
30#define arg_ctx 4
31#define arg_dst 8
32#define arg_src 12
33#define arg_xor 16
34
35/**********************************************************************
36 4-way SSE2 serpent
37 **********************************************************************/
38#define CTX %edx
39
40#define RA %xmm0
41#define RB %xmm1
42#define RC %xmm2
43#define RD %xmm3
44#define RE %xmm4
45
46#define RT0 %xmm5
47#define RT1 %xmm6
48
49#define RNOT %xmm7
50
51#define get_key(i, j, t) \
52 movd (4*(i)+(j))*4(CTX), t; \
53 pshufd $0, t, t;
54
55#define K(x0, x1, x2, x3, x4, i) \
56 get_key(i, 0, x4); \
57 get_key(i, 1, RT0); \
58 get_key(i, 2, RT1); \
59 pxor x4, x0; \
60 pxor RT0, x1; \
61 pxor RT1, x2; \
62 get_key(i, 3, x4); \
63 pxor x4, x3;
64
65#define LK(x0, x1, x2, x3, x4, i) \
66 movdqa x0, x4; \
67 pslld $13, x0; \
68 psrld $(32 - 13), x4; \
69 por x4, x0; \
70 pxor x0, x1; \
71 movdqa x2, x4; \
72 pslld $3, x2; \
73 psrld $(32 - 3), x4; \
74 por x4, x2; \
75 pxor x2, x1; \
76 movdqa x1, x4; \
77 pslld $1, x1; \
78 psrld $(32 - 1), x4; \
79 por x4, x1; \
80 movdqa x0, x4; \
81 pslld $3, x4; \
82 pxor x2, x3; \
83 pxor x4, x3; \
84 movdqa x3, x4; \
85 pslld $7, x3; \
86 psrld $(32 - 7), x4; \
87 por x4, x3; \
88 movdqa x1, x4; \
89 pslld $7, x4; \
90 pxor x1, x0; \
91 pxor x3, x0; \
92 pxor x3, x2; \
93 pxor x4, x2; \
94 movdqa x0, x4; \
95 get_key(i, 1, RT0); \
96 pxor RT0, x1; \
97 get_key(i, 3, RT0); \
98 pxor RT0, x3; \
99 pslld $5, x0; \
100 psrld $(32 - 5), x4; \
101 por x4, x0; \
102 movdqa x2, x4; \
103 pslld $22, x2; \
104 psrld $(32 - 22), x4; \
105 por x4, x2; \
106 get_key(i, 0, RT0); \
107 pxor RT0, x0; \
108 get_key(i, 2, RT0); \
109 pxor RT0, x2;
110
111#define KL(x0, x1, x2, x3, x4, i) \
112 K(x0, x1, x2, x3, x4, i); \
113 movdqa x0, x4; \
114 psrld $5, x0; \
115 pslld $(32 - 5), x4; \
116 por x4, x0; \
117 movdqa x2, x4; \
118 psrld $22, x2; \
119 pslld $(32 - 22), x4; \
120 por x4, x2; \
121 pxor x3, x2; \
122 pxor x3, x0; \
123 movdqa x1, x4; \
124 pslld $7, x4; \
125 pxor x1, x0; \
126 pxor x4, x2; \
127 movdqa x1, x4; \
128 psrld $1, x1; \
129 pslld $(32 - 1), x4; \
130 por x4, x1; \
131 movdqa x3, x4; \
132 psrld $7, x3; \
133 pslld $(32 - 7), x4; \
134 por x4, x3; \
135 pxor x0, x1; \
136 movdqa x0, x4; \
137 pslld $3, x4; \
138 pxor x4, x3; \
139 movdqa x0, x4; \
140 psrld $13, x0; \
141 pslld $(32 - 13), x4; \
142 por x4, x0; \
143 pxor x2, x1; \
144 pxor x2, x3; \
145 movdqa x2, x4; \
146 psrld $3, x2; \
147 pslld $(32 - 3), x4; \
148 por x4, x2;
149
150#define S0(x0, x1, x2, x3, x4) \
151 movdqa x3, x4; \
152 por x0, x3; \
153 pxor x4, x0; \
154 pxor x2, x4; \
155 pxor RNOT, x4; \
156 pxor x1, x3; \
157 pand x0, x1; \
158 pxor x4, x1; \
159 pxor x0, x2; \
160 pxor x3, x0; \
161 por x0, x4; \
162 pxor x2, x0; \
163 pand x1, x2; \
164 pxor x2, x3; \
165 pxor RNOT, x1; \
166 pxor x4, x2; \
167 pxor x2, x1;
168
169#define S1(x0, x1, x2, x3, x4) \
170 movdqa x1, x4; \
171 pxor x0, x1; \
172 pxor x3, x0; \
173 pxor RNOT, x3; \
174 pand x1, x4; \
175 por x1, x0; \
176 pxor x2, x3; \
177 pxor x3, x0; \
178 pxor x3, x1; \
179 pxor x4, x3; \
180 por x4, x1; \
181 pxor x2, x4; \
182 pand x0, x2; \
183 pxor x1, x2; \
184 por x0, x1; \
185 pxor RNOT, x0; \
186 pxor x2, x0; \
187 pxor x1, x4;
188
189#define S2(x0, x1, x2, x3, x4) \
190 pxor RNOT, x3; \
191 pxor x0, x1; \
192 movdqa x0, x4; \
193 pand x2, x0; \
194 pxor x3, x0; \
195 por x4, x3; \
196 pxor x1, x2; \
197 pxor x1, x3; \
198 pand x0, x1; \
199 pxor x2, x0; \
200 pand x3, x2; \
201 por x1, x3; \
202 pxor RNOT, x0; \
203 pxor x0, x3; \
204 pxor x0, x4; \
205 pxor x2, x0; \
206 por x2, x1;
207
208#define S3(x0, x1, x2, x3, x4) \
209 movdqa x1, x4; \
210 pxor x3, x1; \
211 por x0, x3; \
212 pand x0, x4; \
213 pxor x2, x0; \
214 pxor x1, x2; \
215 pand x3, x1; \
216 pxor x3, x2; \
217 por x4, x0; \
218 pxor x3, x4; \
219 pxor x0, x1; \
220 pand x3, x0; \
221 pand x4, x3; \
222 pxor x2, x3; \
223 por x1, x4; \
224 pand x1, x2; \
225 pxor x3, x4; \
226 pxor x3, x0; \
227 pxor x2, x3;
228
229#define S4(x0, x1, x2, x3, x4) \
230 movdqa x3, x4; \
231 pand x0, x3; \
232 pxor x4, x0; \
233 pxor x2, x3; \
234 por x4, x2; \
235 pxor x1, x0; \
236 pxor x3, x4; \
237 por x0, x2; \
238 pxor x1, x2; \
239 pand x0, x1; \
240 pxor x4, x1; \
241 pand x2, x4; \
242 pxor x3, x2; \
243 pxor x0, x4; \
244 por x1, x3; \
245 pxor RNOT, x1; \
246 pxor x0, x3;
247
248#define S5(x0, x1, x2, x3, x4) \
249 movdqa x1, x4; \
250 por x0, x1; \
251 pxor x1, x2; \
252 pxor RNOT, x3; \
253 pxor x0, x4; \
254 pxor x2, x0; \
255 pand x4, x1; \
256 por x3, x4; \
257 pxor x0, x4; \
258 pand x3, x0; \
259 pxor x3, x1; \
260 pxor x2, x3; \
261 pxor x1, x0; \
262 pand x4, x2; \
263 pxor x2, x1; \
264 pand x0, x2; \
265 pxor x2, x3;
266
267#define S6(x0, x1, x2, x3, x4) \
268 movdqa x1, x4; \
269 pxor x0, x3; \
270 pxor x2, x1; \
271 pxor x0, x2; \
272 pand x3, x0; \
273 por x3, x1; \
274 pxor RNOT, x4; \
275 pxor x1, x0; \
276 pxor x2, x1; \
277 pxor x4, x3; \
278 pxor x0, x4; \
279 pand x0, x2; \
280 pxor x1, x4; \
281 pxor x3, x2; \
282 pand x1, x3; \
283 pxor x0, x3; \
284 pxor x2, x1;
285
286#define S7(x0, x1, x2, x3, x4) \
287 pxor RNOT, x1; \
288 movdqa x1, x4; \
289 pxor RNOT, x0; \
290 pand x2, x1; \
291 pxor x3, x1; \
292 por x4, x3; \
293 pxor x2, x4; \
294 pxor x3, x2; \
295 pxor x0, x3; \
296 por x1, x0; \
297 pand x0, x2; \
298 pxor x4, x0; \
299 pxor x3, x4; \
300 pand x0, x3; \
301 pxor x1, x4; \
302 pxor x4, x2; \
303 pxor x1, x3; \
304 por x0, x4; \
305 pxor x1, x4;
306
307#define SI0(x0, x1, x2, x3, x4) \
308 movdqa x3, x4; \
309 pxor x0, x1; \
310 por x1, x3; \
311 pxor x1, x4; \
312 pxor RNOT, x0; \
313 pxor x3, x2; \
314 pxor x0, x3; \
315 pand x1, x0; \
316 pxor x2, x0; \
317 pand x3, x2; \
318 pxor x4, x3; \
319 pxor x3, x2; \
320 pxor x3, x1; \
321 pand x0, x3; \
322 pxor x0, x1; \
323 pxor x2, x0; \
324 pxor x3, x4;
325
326#define SI1(x0, x1, x2, x3, x4) \
327 pxor x3, x1; \
328 movdqa x0, x4; \
329 pxor x2, x0; \
330 pxor RNOT, x2; \
331 por x1, x4; \
332 pxor x3, x4; \
333 pand x1, x3; \
334 pxor x2, x1; \
335 pand x4, x2; \
336 pxor x1, x4; \
337 por x3, x1; \
338 pxor x0, x3; \
339 pxor x0, x2; \
340 por x4, x0; \
341 pxor x4, x2; \
342 pxor x0, x1; \
343 pxor x1, x4;
344
345#define SI2(x0, x1, x2, x3, x4) \
346 pxor x1, x2; \
347 movdqa x3, x4; \
348 pxor RNOT, x3; \
349 por x2, x3; \
350 pxor x4, x2; \
351 pxor x0, x4; \
352 pxor x1, x3; \
353 por x2, x1; \
354 pxor x0, x2; \
355 pxor x4, x1; \
356 por x3, x4; \
357 pxor x3, x2; \
358 pxor x2, x4; \
359 pand x1, x2; \
360 pxor x3, x2; \
361 pxor x4, x3; \
362 pxor x0, x4;
363
364#define SI3(x0, x1, x2, x3, x4) \
365 pxor x1, x2; \
366 movdqa x1, x4; \
367 pand x2, x1; \
368 pxor x0, x1; \
369 por x4, x0; \
370 pxor x3, x4; \
371 pxor x3, x0; \
372 por x1, x3; \
373 pxor x2, x1; \
374 pxor x3, x1; \
375 pxor x2, x0; \
376 pxor x3, x2; \
377 pand x1, x3; \
378 pxor x0, x1; \
379 pand x2, x0; \
380 pxor x3, x4; \
381 pxor x0, x3; \
382 pxor x1, x0;
383
384#define SI4(x0, x1, x2, x3, x4) \
385 pxor x3, x2; \
386 movdqa x0, x4; \
387 pand x1, x0; \
388 pxor x2, x0; \
389 por x3, x2; \
390 pxor RNOT, x4; \
391 pxor x0, x1; \
392 pxor x2, x0; \
393 pand x4, x2; \
394 pxor x0, x2; \
395 por x4, x0; \
396 pxor x3, x0; \
397 pand x2, x3; \
398 pxor x3, x4; \
399 pxor x1, x3; \
400 pand x0, x1; \
401 pxor x1, x4; \
402 pxor x3, x0;
403
404#define SI5(x0, x1, x2, x3, x4) \
405 movdqa x1, x4; \
406 por x2, x1; \
407 pxor x4, x2; \
408 pxor x3, x1; \
409 pand x4, x3; \
410 pxor x3, x2; \
411 por x0, x3; \
412 pxor RNOT, x0; \
413 pxor x2, x3; \
414 por x0, x2; \
415 pxor x1, x4; \
416 pxor x4, x2; \
417 pand x0, x4; \
418 pxor x1, x0; \
419 pxor x3, x1; \
420 pand x2, x0; \
421 pxor x3, x2; \
422 pxor x2, x0; \
423 pxor x4, x2; \
424 pxor x3, x4;
425
426#define SI6(x0, x1, x2, x3, x4) \
427 pxor x2, x0; \
428 movdqa x0, x4; \
429 pand x3, x0; \
430 pxor x3, x2; \
431 pxor x2, x0; \
432 pxor x1, x3; \
433 por x4, x2; \
434 pxor x3, x2; \
435 pand x0, x3; \
436 pxor RNOT, x0; \
437 pxor x1, x3; \
438 pand x2, x1; \
439 pxor x0, x4; \
440 pxor x4, x3; \
441 pxor x2, x4; \
442 pxor x1, x0; \
443 pxor x0, x2;
444
445#define SI7(x0, x1, x2, x3, x4) \
446 movdqa x3, x4; \
447 pand x0, x3; \
448 pxor x2, x0; \
449 por x4, x2; \
450 pxor x1, x4; \
451 pxor RNOT, x0; \
452 por x3, x1; \
453 pxor x0, x4; \
454 pand x2, x0; \
455 pxor x1, x0; \
456 pand x2, x1; \
457 pxor x2, x3; \
458 pxor x3, x4; \
459 pand x3, x2; \
460 por x0, x3; \
461 pxor x4, x1; \
462 pxor x4, x3; \
463 pand x0, x4; \
464 pxor x2, x4;
465
466#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
467 movdqa x2, t3; \
468 movdqa x0, t1; \
469 unpcklps x3, t3; \
470 movdqa x0, t2; \
471 unpcklps x1, t1; \
472 unpckhps x1, t2; \
473 movdqa t3, x1; \
474 unpckhps x3, x2; \
475 movdqa t1, x0; \
476 movhlps t1, x1; \
477 movdqa t2, t1; \
478 movlhps t3, x0; \
479 movlhps x2, t1; \
480 movhlps t2, x2; \
481 movdqa x2, x3; \
482 movdqa t1, x2;
483
484#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
485 movdqu (0*4*4)(in), x0; \
486 movdqu (1*4*4)(in), x1; \
487 movdqu (2*4*4)(in), x2; \
488 movdqu (3*4*4)(in), x3; \
489 \
490 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
491
492#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
493 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
494 \
495 movdqu x0, (0*4*4)(out); \
496 movdqu x1, (1*4*4)(out); \
497 movdqu x2, (2*4*4)(out); \
498 movdqu x3, (3*4*4)(out);
499
500#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
501 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
502 \
503 movdqu (0*4*4)(out), t0; \
504 pxor t0, x0; \
505 movdqu x0, (0*4*4)(out); \
506 movdqu (1*4*4)(out), t0; \
507 pxor t0, x1; \
508 movdqu x1, (1*4*4)(out); \
509 movdqu (2*4*4)(out), t0; \
510 pxor t0, x2; \
511 movdqu x2, (2*4*4)(out); \
512 movdqu (3*4*4)(out), t0; \
513 pxor t0, x3; \
514 movdqu x3, (3*4*4)(out);
515
516.align 8
517.global __serpent_enc_blk_4way
518.type __serpent_enc_blk_4way,@function;
519
520__serpent_enc_blk_4way:
521 /* input:
522 * arg_ctx(%esp): ctx, CTX
523 * arg_dst(%esp): dst
524 * arg_src(%esp): src
525 * arg_xor(%esp): bool, if true: xor output
526 */
527
528 pcmpeqd RNOT, RNOT;
529
530 movl arg_ctx(%esp), CTX;
531
532 movl arg_src(%esp), %eax;
533 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
534
535 K(RA, RB, RC, RD, RE, 0);
536 S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
537 S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
538 S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
539 S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
540 S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
541 S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
542 S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
543 S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
544 S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
545 S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
546 S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
547 S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
548 S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
549 S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
550 S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
551 S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
552 S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
553 S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
554 S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
555 S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
556 S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
557 S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
558 S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
559 S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
560 S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
561 S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
562 S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
563 S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
564 S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
565 S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
566 S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
567 S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
568
569 movl arg_dst(%esp), %eax;
570
571 cmpb $0, arg_xor(%esp);
572 jnz __enc_xor4;
573
574 write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
575
576 ret;
577
578__enc_xor4:
579 xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
580
581 ret;
582
583.align 8
584.global serpent_dec_blk_4way
585.type serpent_dec_blk_4way,@function;
586
587serpent_dec_blk_4way:
588 /* input:
589 * arg_ctx(%esp): ctx, CTX
590 * arg_dst(%esp): dst
591 * arg_src(%esp): src
592 */
593
594 pcmpeqd RNOT, RNOT;
595
596 movl arg_ctx(%esp), CTX;
597
598 movl arg_src(%esp), %eax;
599 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
600
601 K(RA, RB, RC, RD, RE, 32);
602 SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
603 SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
604 SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
605 SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
606 SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
607 SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
608 SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
609 SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
610 SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
611 SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
612 SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
613 SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
614 SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
615 SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
616 SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
617 SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
618 SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
619 SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
620 SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
621 SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
622 SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
623 SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
624 SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
625 SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
626 SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
627 SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
628 SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
629 SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
630 SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
631 SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
632 SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
633 SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
634
635 movl arg_dst(%esp), %eax;
636 write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
637
638 ret;
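The 4-way layout used throughout the file above comes from read_blocks()/transpose_4x4(): four 16-byte blocks are loaded and transposed so that each XMM register (RA..RD) holds the same 32-bit word from all four blocks, letting one instruction stream run four Serpent instances in parallel. A scalar C sketch of that data rearrangement (illustrative only, not kernel code):

#include <stdint.h>

/* state[w][b] = word w of block b, i.e. lane b of register RA+w */
static void transpose_4x4_ref(uint32_t state[4][4], const uint32_t blocks[4][4])
{
	int word, blk;

	for (word = 0; word < 4; word++)
		for (blk = 0; blk < 4; blk++)
			state[word][blk] = blocks[blk][word];
}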
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way SSE2 serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define RA2 %xmm5
42#define RB2 %xmm6
43#define RC2 %xmm7
44#define RD2 %xmm8
45#define RE2 %xmm9
46
47#define RNOT %xmm10
48
49#define RK0 %xmm11
50#define RK1 %xmm12
51#define RK2 %xmm13
52#define RK3 %xmm14
53
54#define S0_1(x0, x1, x2, x3, x4) \
55 movdqa x3, x4; \
56 por x0, x3; \
57 pxor x4, x0; \
58 pxor x2, x4; \
59 pxor RNOT, x4; \
60 pxor x1, x3; \
61 pand x0, x1; \
62 pxor x4, x1; \
63 pxor x0, x2;
64#define S0_2(x0, x1, x2, x3, x4) \
65 pxor x3, x0; \
66 por x0, x4; \
67 pxor x2, x0; \
68 pand x1, x2; \
69 pxor x2, x3; \
70 pxor RNOT, x1; \
71 pxor x4, x2; \
72 pxor x2, x1;
73
74#define S1_1(x0, x1, x2, x3, x4) \
75 movdqa x1, x4; \
76 pxor x0, x1; \
77 pxor x3, x0; \
78 pxor RNOT, x3; \
79 pand x1, x4; \
80 por x1, x0; \
81 pxor x2, x3; \
82 pxor x3, x0; \
83 pxor x3, x1;
84#define S1_2(x0, x1, x2, x3, x4) \
85 pxor x4, x3; \
86 por x4, x1; \
87 pxor x2, x4; \
88 pand x0, x2; \
89 pxor x1, x2; \
90 por x0, x1; \
91 pxor RNOT, x0; \
92 pxor x2, x0; \
93 pxor x1, x4;
94
95#define S2_1(x0, x1, x2, x3, x4) \
96 pxor RNOT, x3; \
97 pxor x0, x1; \
98 movdqa x0, x4; \
99 pand x2, x0; \
100 pxor x3, x0; \
101 por x4, x3; \
102 pxor x1, x2; \
103 pxor x1, x3; \
104 pand x0, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 pxor x2, x0; \
107 pand x3, x2; \
108 por x1, x3; \
109 pxor RNOT, x0; \
110 pxor x0, x3; \
111 pxor x0, x4; \
112 pxor x2, x0; \
113 por x2, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 movdqa x1, x4; \
117 pxor x3, x1; \
118 por x0, x3; \
119 pand x0, x4; \
120 pxor x2, x0; \
121 pxor x1, x2; \
122 pand x3, x1; \
123 pxor x3, x2; \
124 por x4, x0; \
125 pxor x3, x4;
126#define S3_2(x0, x1, x2, x3, x4) \
127 pxor x0, x1; \
128 pand x3, x0; \
129 pand x4, x3; \
130 pxor x2, x3; \
131 por x1, x4; \
132 pand x1, x2; \
133 pxor x3, x4; \
134 pxor x3, x0; \
135 pxor x2, x3;
136
137#define S4_1(x0, x1, x2, x3, x4) \
138 movdqa x3, x4; \
139 pand x0, x3; \
140 pxor x4, x0; \
141 pxor x2, x3; \
142 por x4, x2; \
143 pxor x1, x0; \
144 pxor x3, x4; \
145 por x0, x2; \
146 pxor x1, x2;
147#define S4_2(x0, x1, x2, x3, x4) \
148 pand x0, x1; \
149 pxor x4, x1; \
150 pand x2, x4; \
151 pxor x3, x2; \
152 pxor x0, x4; \
153 por x1, x3; \
154 pxor RNOT, x1; \
155 pxor x0, x3;
156
157#define S5_1(x0, x1, x2, x3, x4) \
158 movdqa x1, x4; \
159 por x0, x1; \
160 pxor x1, x2; \
161 pxor RNOT, x3; \
162 pxor x0, x4; \
163 pxor x2, x0; \
164 pand x4, x1; \
165 por x3, x4; \
166 pxor x0, x4;
167#define S5_2(x0, x1, x2, x3, x4) \
168 pand x3, x0; \
169 pxor x3, x1; \
170 pxor x2, x3; \
171 pxor x1, x0; \
172 pand x4, x2; \
173 pxor x2, x1; \
174 pand x0, x2; \
175 pxor x2, x3;
176
177#define S6_1(x0, x1, x2, x3, x4) \
178 movdqa x1, x4; \
179 pxor x0, x3; \
180 pxor x2, x1; \
181 pxor x0, x2; \
182 pand x3, x0; \
183 por x3, x1; \
184 pxor RNOT, x4; \
185 pxor x1, x0; \
186 pxor x2, x1;
187#define S6_2(x0, x1, x2, x3, x4) \
188 pxor x4, x3; \
189 pxor x0, x4; \
190 pand x0, x2; \
191 pxor x1, x4; \
192 pxor x3, x2; \
193 pand x1, x3; \
194 pxor x0, x3; \
195 pxor x2, x1;
196
197#define S7_1(x0, x1, x2, x3, x4) \
198 pxor RNOT, x1; \
199 movdqa x1, x4; \
200 pxor RNOT, x0; \
201 pand x2, x1; \
202 pxor x3, x1; \
203 por x4, x3; \
204 pxor x2, x4; \
205 pxor x3, x2; \
206 pxor x0, x3; \
207 por x1, x0;
208#define S7_2(x0, x1, x2, x3, x4) \
209 pand x0, x2; \
210 pxor x4, x0; \
211 pxor x3, x4; \
212 pand x0, x3; \
213 pxor x1, x4; \
214 pxor x4, x2; \
215 pxor x1, x3; \
216 por x0, x4; \
217 pxor x1, x4;
218
219#define SI0_1(x0, x1, x2, x3, x4) \
220 movdqa x3, x4; \
221 pxor x0, x1; \
222 por x1, x3; \
223 pxor x1, x4; \
224 pxor RNOT, x0; \
225 pxor x3, x2; \
226 pxor x0, x3; \
227 pand x1, x0; \
228 pxor x2, x0;
229#define SI0_2(x0, x1, x2, x3, x4) \
230 pand x3, x2; \
231 pxor x4, x3; \
232 pxor x3, x2; \
233 pxor x3, x1; \
234 pand x0, x3; \
235 pxor x0, x1; \
236 pxor x2, x0; \
237 pxor x3, x4;
238
239#define SI1_1(x0, x1, x2, x3, x4) \
240 pxor x3, x1; \
241 movdqa x0, x4; \
242 pxor x2, x0; \
243 pxor RNOT, x2; \
244 por x1, x4; \
245 pxor x3, x4; \
246 pand x1, x3; \
247 pxor x2, x1; \
248 pand x4, x2;
249#define SI1_2(x0, x1, x2, x3, x4) \
250 pxor x1, x4; \
251 por x3, x1; \
252 pxor x0, x3; \
253 pxor x0, x2; \
254 por x4, x0; \
255 pxor x4, x2; \
256 pxor x0, x1; \
257 pxor x1, x4;
258
259#define SI2_1(x0, x1, x2, x3, x4) \
260 pxor x1, x2; \
261 movdqa x3, x4; \
262 pxor RNOT, x3; \
263 por x2, x3; \
264 pxor x4, x2; \
265 pxor x0, x4; \
266 pxor x1, x3; \
267 por x2, x1; \
268 pxor x0, x2;
269#define SI2_2(x0, x1, x2, x3, x4) \
270 pxor x4, x1; \
271 por x3, x4; \
272 pxor x3, x2; \
273 pxor x2, x4; \
274 pand x1, x2; \
275 pxor x3, x2; \
276 pxor x4, x3; \
277 pxor x0, x4;
278
279#define SI3_1(x0, x1, x2, x3, x4) \
280 pxor x1, x2; \
281 movdqa x1, x4; \
282 pand x2, x1; \
283 pxor x0, x1; \
284 por x4, x0; \
285 pxor x3, x4; \
286 pxor x3, x0; \
287 por x1, x3; \
288 pxor x2, x1;
289#define SI3_2(x0, x1, x2, x3, x4) \
290 pxor x3, x1; \
291 pxor x2, x0; \
292 pxor x3, x2; \
293 pand x1, x3; \
294 pxor x0, x1; \
295 pand x2, x0; \
296 pxor x3, x4; \
297 pxor x0, x3; \
298 pxor x1, x0;
299
300#define SI4_1(x0, x1, x2, x3, x4) \
301 pxor x3, x2; \
302 movdqa x0, x4; \
303 pand x1, x0; \
304 pxor x2, x0; \
305 por x3, x2; \
306 pxor RNOT, x4; \
307 pxor x0, x1; \
308 pxor x2, x0; \
309 pand x4, x2;
310#define SI4_2(x0, x1, x2, x3, x4) \
311 pxor x0, x2; \
312 por x4, x0; \
313 pxor x3, x0; \
314 pand x2, x3; \
315 pxor x3, x4; \
316 pxor x1, x3; \
317 pand x0, x1; \
318 pxor x1, x4; \
319 pxor x3, x0;
320
321#define SI5_1(x0, x1, x2, x3, x4) \
322 movdqa x1, x4; \
323 por x2, x1; \
324 pxor x4, x2; \
325 pxor x3, x1; \
326 pand x4, x3; \
327 pxor x3, x2; \
328 por x0, x3; \
329 pxor RNOT, x0; \
330 pxor x2, x3; \
331 por x0, x2;
332#define SI5_2(x0, x1, x2, x3, x4) \
333 pxor x1, x4; \
334 pxor x4, x2; \
335 pand x0, x4; \
336 pxor x1, x0; \
337 pxor x3, x1; \
338 pand x2, x0; \
339 pxor x3, x2; \
340 pxor x2, x0; \
341 pxor x4, x2; \
342 pxor x3, x4;
343
344#define SI6_1(x0, x1, x2, x3, x4) \
345 pxor x2, x0; \
346 movdqa x0, x4; \
347 pand x3, x0; \
348 pxor x3, x2; \
349 pxor x2, x0; \
350 pxor x1, x3; \
351 por x4, x2; \
352 pxor x3, x2; \
353 pand x0, x3;
354#define SI6_2(x0, x1, x2, x3, x4) \
355 pxor RNOT, x0; \
356 pxor x1, x3; \
357 pand x2, x1; \
358 pxor x0, x4; \
359 pxor x4, x3; \
360 pxor x2, x4; \
361 pxor x1, x0; \
362 pxor x0, x2;
363
364#define SI7_1(x0, x1, x2, x3, x4) \
365 movdqa x3, x4; \
366 pand x0, x3; \
367 pxor x2, x0; \
368 por x4, x2; \
369 pxor x1, x4; \
370 pxor RNOT, x0; \
371 por x3, x1; \
372 pxor x0, x4; \
373 pand x2, x0; \
374 pxor x1, x0;
375#define SI7_2(x0, x1, x2, x3, x4) \
376 pand x2, x1; \
377 pxor x2, x3; \
378 pxor x3, x4; \
379 pand x3, x2; \
380 por x0, x3; \
381 pxor x4, x1; \
382 pxor x4, x3; \
383 pand x0, x4; \
384 pxor x2, x4;
385
386#define get_key(i, j, t) \
387 movd (4*(i)+(j))*4(CTX), t; \
388 pshufd $0, t, t;
389
390#define K2(x0, x1, x2, x3, x4, i) \
391 get_key(i, 0, RK0); \
392 get_key(i, 1, RK1); \
393 get_key(i, 2, RK2); \
394 get_key(i, 3, RK3); \
395 pxor RK0, x0 ## 1; \
396 pxor RK1, x1 ## 1; \
397 pxor RK2, x2 ## 1; \
398 pxor RK3, x3 ## 1; \
399 pxor RK0, x0 ## 2; \
400 pxor RK1, x1 ## 2; \
401 pxor RK2, x2 ## 2; \
402 pxor RK3, x3 ## 2;
403
404#define LK2(x0, x1, x2, x3, x4, i) \
405 movdqa x0 ## 1, x4 ## 1; \
406 pslld $13, x0 ## 1; \
407 psrld $(32 - 13), x4 ## 1; \
408 por x4 ## 1, x0 ## 1; \
409 pxor x0 ## 1, x1 ## 1; \
410 movdqa x2 ## 1, x4 ## 1; \
411 pslld $3, x2 ## 1; \
412 psrld $(32 - 3), x4 ## 1; \
413 por x4 ## 1, x2 ## 1; \
414 pxor x2 ## 1, x1 ## 1; \
415 movdqa x0 ## 2, x4 ## 2; \
416 pslld $13, x0 ## 2; \
417 psrld $(32 - 13), x4 ## 2; \
418 por x4 ## 2, x0 ## 2; \
419 pxor x0 ## 2, x1 ## 2; \
420 movdqa x2 ## 2, x4 ## 2; \
421 pslld $3, x2 ## 2; \
422 psrld $(32 - 3), x4 ## 2; \
423 por x4 ## 2, x2 ## 2; \
424 pxor x2 ## 2, x1 ## 2; \
425 movdqa x1 ## 1, x4 ## 1; \
426 pslld $1, x1 ## 1; \
427 psrld $(32 - 1), x4 ## 1; \
428 por x4 ## 1, x1 ## 1; \
429 movdqa x0 ## 1, x4 ## 1; \
430 pslld $3, x4 ## 1; \
431 pxor x2 ## 1, x3 ## 1; \
432 pxor x4 ## 1, x3 ## 1; \
433 movdqa x3 ## 1, x4 ## 1; \
434 get_key(i, 1, RK1); \
435 movdqa x1 ## 2, x4 ## 2; \
436 pslld $1, x1 ## 2; \
437 psrld $(32 - 1), x4 ## 2; \
438 por x4 ## 2, x1 ## 2; \
439 movdqa x0 ## 2, x4 ## 2; \
440 pslld $3, x4 ## 2; \
441 pxor x2 ## 2, x3 ## 2; \
442 pxor x4 ## 2, x3 ## 2; \
443 movdqa x3 ## 2, x4 ## 2; \
444 get_key(i, 3, RK3); \
445 pslld $7, x3 ## 1; \
446 psrld $(32 - 7), x4 ## 1; \
447 por x4 ## 1, x3 ## 1; \
448 movdqa x1 ## 1, x4 ## 1; \
449 pslld $7, x4 ## 1; \
450 pxor x1 ## 1, x0 ## 1; \
451 pxor x3 ## 1, x0 ## 1; \
452 pxor x3 ## 1, x2 ## 1; \
453 pxor x4 ## 1, x2 ## 1; \
454 get_key(i, 0, RK0); \
455 pslld $7, x3 ## 2; \
456 psrld $(32 - 7), x4 ## 2; \
457 por x4 ## 2, x3 ## 2; \
458 movdqa x1 ## 2, x4 ## 2; \
459 pslld $7, x4 ## 2; \
460 pxor x1 ## 2, x0 ## 2; \
461 pxor x3 ## 2, x0 ## 2; \
462 pxor x3 ## 2, x2 ## 2; \
463 pxor x4 ## 2, x2 ## 2; \
464 get_key(i, 2, RK2); \
465 pxor RK1, x1 ## 1; \
466 pxor RK3, x3 ## 1; \
467 movdqa x0 ## 1, x4 ## 1; \
468 pslld $5, x0 ## 1; \
469 psrld $(32 - 5), x4 ## 1; \
470 por x4 ## 1, x0 ## 1; \
471 movdqa x2 ## 1, x4 ## 1; \
472 pslld $22, x2 ## 1; \
473 psrld $(32 - 22), x4 ## 1; \
474 por x4 ## 1, x2 ## 1; \
475 pxor RK0, x0 ## 1; \
476 pxor RK2, x2 ## 1; \
477 pxor RK1, x1 ## 2; \
478 pxor RK3, x3 ## 2; \
479 movdqa x0 ## 2, x4 ## 2; \
480 pslld $5, x0 ## 2; \
481 psrld $(32 - 5), x4 ## 2; \
482 por x4 ## 2, x0 ## 2; \
483 movdqa x2 ## 2, x4 ## 2; \
484 pslld $22, x2 ## 2; \
485 psrld $(32 - 22), x4 ## 2; \
486 por x4 ## 2, x2 ## 2; \
487 pxor RK0, x0 ## 2; \
488 pxor RK2, x2 ## 2;
489
490#define KL2(x0, x1, x2, x3, x4, i) \
491 pxor RK0, x0 ## 1; \
492 pxor RK2, x2 ## 1; \
493 movdqa x0 ## 1, x4 ## 1; \
494 psrld $5, x0 ## 1; \
495 pslld $(32 - 5), x4 ## 1; \
496 por x4 ## 1, x0 ## 1; \
497 pxor RK3, x3 ## 1; \
498 pxor RK1, x1 ## 1; \
499 movdqa x2 ## 1, x4 ## 1; \
500 psrld $22, x2 ## 1; \
501 pslld $(32 - 22), x4 ## 1; \
502 por x4 ## 1, x2 ## 1; \
503 pxor x3 ## 1, x2 ## 1; \
504 pxor RK0, x0 ## 2; \
505 pxor RK2, x2 ## 2; \
506 movdqa x0 ## 2, x4 ## 2; \
507 psrld $5, x0 ## 2; \
508 pslld $(32 - 5), x4 ## 2; \
509 por x4 ## 2, x0 ## 2; \
510 pxor RK3, x3 ## 2; \
511 pxor RK1, x1 ## 2; \
512 movdqa x2 ## 2, x4 ## 2; \
513 psrld $22, x2 ## 2; \
514 pslld $(32 - 22), x4 ## 2; \
515 por x4 ## 2, x2 ## 2; \
516 pxor x3 ## 2, x2 ## 2; \
517 pxor x3 ## 1, x0 ## 1; \
518 movdqa x1 ## 1, x4 ## 1; \
519 pslld $7, x4 ## 1; \
520 pxor x1 ## 1, x0 ## 1; \
521 pxor x4 ## 1, x2 ## 1; \
522 movdqa x1 ## 1, x4 ## 1; \
523 psrld $1, x1 ## 1; \
524 pslld $(32 - 1), x4 ## 1; \
525 por x4 ## 1, x1 ## 1; \
526 pxor x3 ## 2, x0 ## 2; \
527 movdqa x1 ## 2, x4 ## 2; \
528 pslld $7, x4 ## 2; \
529 pxor x1 ## 2, x0 ## 2; \
530 pxor x4 ## 2, x2 ## 2; \
531 movdqa x1 ## 2, x4 ## 2; \
532 psrld $1, x1 ## 2; \
533 pslld $(32 - 1), x4 ## 2; \
534 por x4 ## 2, x1 ## 2; \
535 movdqa x3 ## 1, x4 ## 1; \
536 psrld $7, x3 ## 1; \
537 pslld $(32 - 7), x4 ## 1; \
538 por x4 ## 1, x3 ## 1; \
539 pxor x0 ## 1, x1 ## 1; \
540 movdqa x0 ## 1, x4 ## 1; \
541 pslld $3, x4 ## 1; \
542 pxor x4 ## 1, x3 ## 1; \
543 movdqa x0 ## 1, x4 ## 1; \
544 movdqa x3 ## 2, x4 ## 2; \
545 psrld $7, x3 ## 2; \
546 pslld $(32 - 7), x4 ## 2; \
547 por x4 ## 2, x3 ## 2; \
548 pxor x0 ## 2, x1 ## 2; \
549 movdqa x0 ## 2, x4 ## 2; \
550 pslld $3, x4 ## 2; \
551 pxor x4 ## 2, x3 ## 2; \
552 movdqa x0 ## 2, x4 ## 2; \
553 psrld $13, x0 ## 1; \
554 pslld $(32 - 13), x4 ## 1; \
555 por x4 ## 1, x0 ## 1; \
556 pxor x2 ## 1, x1 ## 1; \
557 pxor x2 ## 1, x3 ## 1; \
558 movdqa x2 ## 1, x4 ## 1; \
559 psrld $3, x2 ## 1; \
560 pslld $(32 - 3), x4 ## 1; \
561 por x4 ## 1, x2 ## 1; \
562 psrld $13, x0 ## 2; \
563 pslld $(32 - 13), x4 ## 2; \
564 por x4 ## 2, x0 ## 2; \
565 pxor x2 ## 2, x1 ## 2; \
566 pxor x2 ## 2, x3 ## 2; \
567 movdqa x2 ## 2, x4 ## 2; \
568 psrld $3, x2 ## 2; \
569 pslld $(32 - 3), x4 ## 2; \
570 por x4 ## 2, x2 ## 2;
571
572#define S(SBOX, x0, x1, x2, x3, x4) \
573 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
574 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
575 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
576 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
577
578#define SP(SBOX, x0, x1, x2, x3, x4, i) \
579 get_key(i, 0, RK0); \
580 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
581 get_key(i, 2, RK2); \
582 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
583 get_key(i, 3, RK3); \
584 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
585 get_key(i, 1, RK1); \
586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
587
588#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
589 movdqa x2, t3; \
590 movdqa x0, t1; \
591 unpcklps x3, t3; \
592 movdqa x0, t2; \
593 unpcklps x1, t1; \
594 unpckhps x1, t2; \
595 movdqa t3, x1; \
596 unpckhps x3, x2; \
597 movdqa t1, x0; \
598 movhlps t1, x1; \
599 movdqa t2, t1; \
600 movlhps t3, x0; \
601 movlhps x2, t1; \
602 movhlps t2, x2; \
603 movdqa x2, x3; \
604 movdqa t1, x2;
605
606#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
607 movdqu (0*4*4)(in), x0; \
608 movdqu (1*4*4)(in), x1; \
609 movdqu (2*4*4)(in), x2; \
610 movdqu (3*4*4)(in), x3; \
611 \
612 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
613
614#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
615 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
616 \
617 movdqu x0, (0*4*4)(out); \
618 movdqu x1, (1*4*4)(out); \
619 movdqu x2, (2*4*4)(out); \
620 movdqu x3, (3*4*4)(out);
621
622#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
623 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
624 \
625 movdqu (0*4*4)(out), t0; \
626 pxor t0, x0; \
627 movdqu x0, (0*4*4)(out); \
628 movdqu (1*4*4)(out), t0; \
629 pxor t0, x1; \
630 movdqu x1, (1*4*4)(out); \
631 movdqu (2*4*4)(out), t0; \
632 pxor t0, x2; \
633 movdqu x2, (2*4*4)(out); \
634 movdqu (3*4*4)(out), t0; \
635 pxor t0, x3; \
636 movdqu x3, (3*4*4)(out);
637
638.align 8
639.global __serpent_enc_blk_8way
640.type __serpent_enc_blk_8way,@function;
641
642__serpent_enc_blk_8way:
643 /* input:
644 * %rdi: ctx, CTX
645 * %rsi: dst
646 * %rdx: src
647 * %rcx: bool, if true: xor output
648 */
649
650 pcmpeqd RNOT, RNOT;
651
652 leaq (4*4*4)(%rdx), %rax;
653 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
654 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
655
656 K2(RA, RB, RC, RD, RE, 0);
657 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
658 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
659 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
660 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
661 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
662 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
663 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
664 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
665 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
666 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
667 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
668 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
669 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
670 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
671 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
672 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
673 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
674 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
675 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
676 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
677 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
678 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
679 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
680 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
681 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
682 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
683 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
684 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
685 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
686 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
687 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
688 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
689
690 leaq (4*4*4)(%rsi), %rax;
691
692 testb %cl, %cl;
693 jnz __enc_xor8;
694
695 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
696 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
697
698 ret;
699
700__enc_xor8:
701 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
702 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
703
704 ret;
705
706.align 8
707.global serpent_dec_blk_8way
708.type serpent_dec_blk_8way,@function;
709
710serpent_dec_blk_8way:
711 /* input:
712 * %rdi: ctx, CTX
713 * %rsi: dst
714 * %rdx: src
715 */
716
717 pcmpeqd RNOT, RNOT;
718
719 leaq (4*4*4)(%rdx), %rax;
720 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
721 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
722
723 K2(RA, RB, RC, RD, RE, 32);
724 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
725 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
726 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
727 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
728 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
729 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
730 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
731 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
732 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
733 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
734 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
735 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
736 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
737 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
738 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
739 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
740 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
741 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
742 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
743 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
744 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
745 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
746 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
747 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
748 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
749 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
750 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
751 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
752 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
753 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
754 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
755 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
756
757 leaq (4*4*4)(%rsi), %rax;
758 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
759 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
760
761 ret;
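The 8-way variant above processes eight blocks as two groups of four: each macro argument names a register pair (RA, RB, ...) and the "x ## 1" / "x ## 2" token pasting selects the two halves (RA1/RA2 and so on), so one macro body emits interleaved instructions for both groups. A tiny C-preprocessor sketch of the same trick (illustrative only):

#include <stdio.h>

static const char *RA1 = "first 4-block group";
static const char *RA2 = "second 4-block group";

#define LANE(x, n)	x ## n		/* LANE(RA, 1) expands to RA1 */

int main(void)
{
	printf("%s / %s\n", LANE(RA, 1), LANE(RA, 2));
	return 0;
}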
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..7955a9b76b91
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,1070 @@
1/*
2 * Glue Code for SSE2 assembler versions of Serpent Cipher
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Glue code based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
11 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
12 * CTR part based on code (crypto/ctr.c) by:
13 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 */
31
32#include <linux/module.h>
33#include <linux/hardirq.h>
34#include <linux/types.h>
35#include <linux/crypto.h>
36#include <linux/err.h>
37#include <crypto/algapi.h>
38#include <crypto/serpent.h>
39#include <crypto/cryptd.h>
40#include <crypto/b128ops.h>
41#include <crypto/ctr.h>
42#include <crypto/lrw.h>
43#include <crypto/xts.h>
44#include <asm/i387.h>
45#include <asm/serpent.h>
46#include <crypto/scatterwalk.h>
47#include <linux/workqueue.h>
48#include <linux/spinlock.h>
49
50struct async_serpent_ctx {
51 struct cryptd_ablkcipher *cryptd_tfm;
52};
53
54static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
55{
56 if (fpu_enabled)
57 return true;
58
59 /* SSE2 is only used when the chunk to be processed is large enough, so
60 * do not enable FPU until it is necessary.
61 */
62 if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
63 return false;
64
65 kernel_fpu_begin();
66 return true;
67}
68
69static inline void serpent_fpu_end(bool fpu_enabled)
70{
71 if (fpu_enabled)
72 kernel_fpu_end();
73}
74
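In concrete numbers (a sketch, not kernel code): with SERPENT_BLOCK_SIZE = 16 and SERPENT_PARALLEL_BLOCKS taken as 8 (the x86_64 8-way value; the i586 build uses 4), the check above defers kernel_fpu_begin() until at least 128 bytes are queued, so short requests never pay the FPU save/restore cost.

#include <stdbool.h>

#define BLOCK_SIZE		16
#define PARALLEL_BLOCKS		8	/* assumed x86_64 value */

static bool worth_using_sse2(unsigned int nbytes)
{
	return nbytes >= BLOCK_SIZE * PARALLEL_BLOCKS;	/* >= 128 bytes */
}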
75static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
76 bool enc)
77{
78 bool fpu_enabled = false;
79 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = SERPENT_BLOCK_SIZE;
81 unsigned int nbytes;
82 int err;
83
84 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86
87 while ((nbytes = walk->nbytes)) {
88 u8 *wsrc = walk->src.virt.addr;
89 u8 *wdst = walk->dst.virt.addr;
90
91 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
92
93 /* Process multi-block batch */
94 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
95 do {
96 if (enc)
97 serpent_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 serpent_dec_blk_xway(ctx, wdst, wsrc);
100
101 wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
102 wdst += bsize * SERPENT_PARALLEL_BLOCKS;
103 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
104 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
105
106 if (nbytes < bsize)
107 goto done;
108 }
109
110 /* Handle leftovers */
111 do {
112 if (enc)
113 __serpent_encrypt(ctx, wdst, wsrc);
114 else
115 __serpent_decrypt(ctx, wdst, wsrc);
116
117 wsrc += bsize;
118 wdst += bsize;
119 nbytes -= bsize;
120 } while (nbytes >= bsize);
121
122done:
123 err = blkcipher_walk_done(desc, walk, nbytes);
124 }
125
126 serpent_fpu_end(fpu_enabled);
127 return err;
128}
129
130static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 struct blkcipher_walk walk;
134
135 blkcipher_walk_init(&walk, dst, src, nbytes);
136 return ecb_crypt(desc, &walk, true);
137}
138
139static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 struct blkcipher_walk walk;
143
144 blkcipher_walk_init(&walk, dst, src, nbytes);
145 return ecb_crypt(desc, &walk, false);
146}
147
148static struct crypto_alg blk_ecb_alg = {
149 .cra_name = "__ecb-serpent-sse2",
150 .cra_driver_name = "__driver-ecb-serpent-sse2",
151 .cra_priority = 0,
152 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
153 .cra_blocksize = SERPENT_BLOCK_SIZE,
154 .cra_ctxsize = sizeof(struct serpent_ctx),
155 .cra_alignmask = 0,
156 .cra_type = &crypto_blkcipher_type,
157 .cra_module = THIS_MODULE,
158 .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
159 .cra_u = {
160 .blkcipher = {
161 .min_keysize = SERPENT_MIN_KEY_SIZE,
162 .max_keysize = SERPENT_MAX_KEY_SIZE,
163 .setkey = serpent_setkey,
164 .encrypt = ecb_encrypt,
165 .decrypt = ecb_decrypt,
166 },
167 },
168};
169
170static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
171 struct blkcipher_walk *walk)
172{
173 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
174 const unsigned int bsize = SERPENT_BLOCK_SIZE;
175 unsigned int nbytes = walk->nbytes;
176 u128 *src = (u128 *)walk->src.virt.addr;
177 u128 *dst = (u128 *)walk->dst.virt.addr;
178 u128 *iv = (u128 *)walk->iv;
179
180 do {
181 u128_xor(dst, src, iv);
182 __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
183 iv = dst;
184
185 src += 1;
186 dst += 1;
187 nbytes -= bsize;
188 } while (nbytes >= bsize);
189
190 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
191 return nbytes;
192}
193
194static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
195 struct scatterlist *src, unsigned int nbytes)
196{
197 struct blkcipher_walk walk;
198 int err;
199
200 blkcipher_walk_init(&walk, dst, src, nbytes);
201 err = blkcipher_walk_virt(desc, &walk);
202
203 while ((nbytes = walk.nbytes)) {
204 nbytes = __cbc_encrypt(desc, &walk);
205 err = blkcipher_walk_done(desc, &walk, nbytes);
206 }
207
208 return err;
209}
210
211static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
212 struct blkcipher_walk *walk)
213{
214 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
215 const unsigned int bsize = SERPENT_BLOCK_SIZE;
216 unsigned int nbytes = walk->nbytes;
217 u128 *src = (u128 *)walk->src.virt.addr;
218 u128 *dst = (u128 *)walk->dst.virt.addr;
219 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
220 u128 last_iv;
221 int i;
222
223 /* Start of the last block. */
224 src += nbytes / bsize - 1;
225 dst += nbytes / bsize - 1;
226
227 last_iv = *src;
228
229 /* Process multi-block batch */
230 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
231 do {
232 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
233 src -= SERPENT_PARALLEL_BLOCKS - 1;
234 dst -= SERPENT_PARALLEL_BLOCKS - 1;
235
236 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
237 ivs[i] = src[i];
238
239 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
240
241 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
242 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
243
244 nbytes -= bsize;
245 if (nbytes < bsize)
246 goto done;
247
248 u128_xor(dst, dst, src - 1);
249 src -= 1;
250 dst -= 1;
251 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
252
253 if (nbytes < bsize)
254 goto done;
255 }
256
257 /* Handle leftovers */
258 for (;;) {
259 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
260
261 nbytes -= bsize;
262 if (nbytes < bsize)
263 break;
264
265 u128_xor(dst, dst, src - 1);
266 src -= 1;
267 dst -= 1;
268 }
269
270done:
271 u128_xor(dst, dst, (u128 *)walk->iv);
272 *(u128 *)walk->iv = last_iv;
273
274 return nbytes;
275}
276
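__cbc_decrypt() above implements the usual CBC recurrence P[i] = D(C[i]) ^ C[i-1] (with C[-1] = IV). Because each plaintext needs the previous ciphertext, the function walks from the last block backwards and stashes the ciphertexts it still needs in ivs[], which lets the parallel path decrypt in place. A serial C reference of the same recurrence (a sketch, not kernel code; block_decrypt stands in for __serpent_decrypt):

#include <stddef.h>
#include <string.h>

#define BSIZE 16

typedef void (*block_decrypt_t)(void *ctx, unsigned char *dst,
				const unsigned char *src);

static void cbc_decrypt_ref(void *ctx, block_decrypt_t block_decrypt,
			    unsigned char *buf, size_t nblocks,
			    const unsigned char iv[BSIZE])
{
	unsigned char prev[BSIZE], tmp[BSIZE];
	size_t i, j;

	memcpy(prev, iv, BSIZE);
	for (i = 0; i < nblocks; i++) {
		memcpy(tmp, buf + i * BSIZE, BSIZE);	/* keep C[i] for next round */
		block_decrypt(ctx, buf + i * BSIZE, tmp);
		for (j = 0; j < BSIZE; j++)
			buf[i * BSIZE + j] ^= prev[j];	/* ... ^ C[i-1]             */
		memcpy(prev, tmp, BSIZE);
	}
}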
277static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 bool fpu_enabled = false;
281 struct blkcipher_walk walk;
282 int err;
283
284 blkcipher_walk_init(&walk, dst, src, nbytes);
285 err = blkcipher_walk_virt(desc, &walk);
286 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
287
288 while ((nbytes = walk.nbytes)) {
289 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
290 nbytes = __cbc_decrypt(desc, &walk);
291 err = blkcipher_walk_done(desc, &walk, nbytes);
292 }
293
294 serpent_fpu_end(fpu_enabled);
295 return err;
296}
297
298static struct crypto_alg blk_cbc_alg = {
299 .cra_name = "__cbc-serpent-sse2",
300 .cra_driver_name = "__driver-cbc-serpent-sse2",
301 .cra_priority = 0,
302 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
303 .cra_blocksize = SERPENT_BLOCK_SIZE,
304 .cra_ctxsize = sizeof(struct serpent_ctx),
305 .cra_alignmask = 0,
306 .cra_type = &crypto_blkcipher_type,
307 .cra_module = THIS_MODULE,
308 .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
309 .cra_u = {
310 .blkcipher = {
311 .min_keysize = SERPENT_MIN_KEY_SIZE,
312 .max_keysize = SERPENT_MAX_KEY_SIZE,
313 .setkey = serpent_setkey,
314 .encrypt = cbc_encrypt,
315 .decrypt = cbc_decrypt,
316 },
317 },
318};
319
320static inline void u128_to_be128(be128 *dst, const u128 *src)
321{
322 dst->a = cpu_to_be64(src->a);
323 dst->b = cpu_to_be64(src->b);
324}
325
326static inline void be128_to_u128(u128 *dst, const be128 *src)
327{
328 dst->a = be64_to_cpu(src->a);
329 dst->b = be64_to_cpu(src->b);
330}
331
332static inline void u128_inc(u128 *i)
333{
334 i->b++;
335 if (!i->b)
336 i->a++;
337}
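/*
 * Annotation (not in the original patch): the CTR counter is kept as a
 * host-endian u128 and only converted with u128_to_be128() when a counter
 * block is fed to the cipher; u128_inc() is the 128-bit increment, carrying
 * from the low 64-bit half (b) into the high half (a).
 */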
338
339static void ctr_crypt_final(struct blkcipher_desc *desc,
340 struct blkcipher_walk *walk)
341{
342 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
343 u8 *ctrblk = walk->iv;
344 u8 keystream[SERPENT_BLOCK_SIZE];
345 u8 *src = walk->src.virt.addr;
346 u8 *dst = walk->dst.virt.addr;
347 unsigned int nbytes = walk->nbytes;
348
349 __serpent_encrypt(ctx, keystream, ctrblk);
350 crypto_xor(keystream, src, nbytes);
351 memcpy(dst, keystream, nbytes);
352
353 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
354}
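/*
 * Annotation (not in the original patch): CTR is a stream mode, so the
 * final partial block needs no padding; one more counter block is
 * encrypted and only the remaining bytes of keystream are XORed in.
 */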
355
356static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
357 struct blkcipher_walk *walk)
358{
359 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
360 const unsigned int bsize = SERPENT_BLOCK_SIZE;
361 unsigned int nbytes = walk->nbytes;
362 u128 *src = (u128 *)walk->src.virt.addr;
363 u128 *dst = (u128 *)walk->dst.virt.addr;
364 u128 ctrblk;
365 be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
366 int i;
367
368 be128_to_u128(&ctrblk, (be128 *)walk->iv);
369
370 /* Process multi-block batch */
371 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
372 do {
373 /* create ctrblks for parallel encrypt */
374 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
375 if (dst != src)
376 dst[i] = src[i];
377
378 u128_to_be128(&ctrblocks[i], &ctrblk);
379 u128_inc(&ctrblk);
380 }
381
382 serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
383 (u8 *)ctrblocks);
384
385 src += SERPENT_PARALLEL_BLOCKS;
386 dst += SERPENT_PARALLEL_BLOCKS;
387 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
388 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
389
390 if (nbytes < bsize)
391 goto done;
392 }
393
394 /* Handle leftovers */
395 do {
396 if (dst != src)
397 *dst = *src;
398
399 u128_to_be128(&ctrblocks[0], &ctrblk);
400 u128_inc(&ctrblk);
401
402 __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
403 u128_xor(dst, dst, (u128 *)ctrblocks);
404
405 src += 1;
406 dst += 1;
407 nbytes -= bsize;
408 } while (nbytes >= bsize);
409
410done:
411 u128_to_be128((be128 *)walk->iv, &ctrblk);
412 return nbytes;
413}
414
415static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
416 struct scatterlist *src, unsigned int nbytes)
417{
418 bool fpu_enabled = false;
419 struct blkcipher_walk walk;
420 int err;
421
422 blkcipher_walk_init(&walk, dst, src, nbytes);
423 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
424 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
425
426 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
427 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
428 nbytes = __ctr_crypt(desc, &walk);
429 err = blkcipher_walk_done(desc, &walk, nbytes);
430 }
431
432 serpent_fpu_end(fpu_enabled);
433
434 if (walk.nbytes) {
435 ctr_crypt_final(desc, &walk);
436 err = blkcipher_walk_done(desc, &walk, 0);
437 }
438
439 return err;
440}
441
442static struct crypto_alg blk_ctr_alg = {
443 .cra_name = "__ctr-serpent-sse2",
444 .cra_driver_name = "__driver-ctr-serpent-sse2",
445 .cra_priority = 0,
446 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
447 .cra_blocksize = 1,
448 .cra_ctxsize = sizeof(struct serpent_ctx),
449 .cra_alignmask = 0,
450 .cra_type = &crypto_blkcipher_type,
451 .cra_module = THIS_MODULE,
452 .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
453 .cra_u = {
454 .blkcipher = {
455 .min_keysize = SERPENT_MIN_KEY_SIZE,
456 .max_keysize = SERPENT_MAX_KEY_SIZE,
457 .ivsize = SERPENT_BLOCK_SIZE,
458 .setkey = serpent_setkey,
459 .encrypt = ctr_crypt,
460 .decrypt = ctr_crypt,
461 },
462 },
463};
464
465struct crypt_priv {
466 struct serpent_ctx *ctx;
467 bool fpu_enabled;
468};
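/*
 * Annotation (not in the original patch): crypt_priv is threaded through
 * lrw_crypt()/xts_crypt() so the FPU section opened by the first callback
 * invocation stays open across subsequent callbacks and is closed exactly
 * once by the top-level lrw_*/xts_* entry points.
 */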
469
470static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
471{
472 const unsigned int bsize = SERPENT_BLOCK_SIZE;
473 struct crypt_priv *ctx = priv;
474 int i;
475
476 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
477
478 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
479 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
480 return;
481 }
482
483 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
484 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
485}
486
487static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
488{
489 const unsigned int bsize = SERPENT_BLOCK_SIZE;
490 struct crypt_priv *ctx = priv;
491 int i;
492
493 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
494
495 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
496 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
497 return;
498 }
499
500 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
501 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
502}
503
504struct serpent_lrw_ctx {
505 struct lrw_table_ctx lrw_table;
506 struct serpent_ctx serpent_ctx;
507};
508
509static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
510 unsigned int keylen)
511{
512 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
513 int err;
514
515 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
516 SERPENT_BLOCK_SIZE);
517 if (err)
518 return err;
519
520 return lrw_init_table(&ctx->lrw_table, key + keylen -
521 SERPENT_BLOCK_SIZE);
522}
523
524static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
525 struct scatterlist *src, unsigned int nbytes)
526{
527 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
528 be128 buf[SERPENT_PARALLEL_BLOCKS];
529 struct crypt_priv crypt_ctx = {
530 .ctx = &ctx->serpent_ctx,
531 .fpu_enabled = false,
532 };
533 struct lrw_crypt_req req = {
534 .tbuf = buf,
535 .tbuflen = sizeof(buf),
536
537 .table_ctx = &ctx->lrw_table,
538 .crypt_ctx = &crypt_ctx,
539 .crypt_fn = encrypt_callback,
540 };
541 int ret;
542
543 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
544 ret = lrw_crypt(desc, dst, src, nbytes, &req);
545 serpent_fpu_end(crypt_ctx.fpu_enabled);
546
547 return ret;
548}
549
550static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
551 struct scatterlist *src, unsigned int nbytes)
552{
553 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
554 be128 buf[SERPENT_PARALLEL_BLOCKS];
555 struct crypt_priv crypt_ctx = {
556 .ctx = &ctx->serpent_ctx,
557 .fpu_enabled = false,
558 };
559 struct lrw_crypt_req req = {
560 .tbuf = buf,
561 .tbuflen = sizeof(buf),
562
563 .table_ctx = &ctx->lrw_table,
564 .crypt_ctx = &crypt_ctx,
565 .crypt_fn = decrypt_callback,
566 };
567 int ret;
568
569 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
570 ret = lrw_crypt(desc, dst, src, nbytes, &req);
571 serpent_fpu_end(crypt_ctx.fpu_enabled);
572
573 return ret;
574}
575
576static void lrw_exit_tfm(struct crypto_tfm *tfm)
577{
578 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
579
580 lrw_free_table(&ctx->lrw_table);
581}
582
583static struct crypto_alg blk_lrw_alg = {
584 .cra_name = "__lrw-serpent-sse2",
585 .cra_driver_name = "__driver-lrw-serpent-sse2",
586 .cra_priority = 0,
587 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
588 .cra_blocksize = SERPENT_BLOCK_SIZE,
589 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
590 .cra_alignmask = 0,
591 .cra_type = &crypto_blkcipher_type,
592 .cra_module = THIS_MODULE,
593 .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
594 .cra_exit = lrw_exit_tfm,
595 .cra_u = {
596 .blkcipher = {
597 .min_keysize = SERPENT_MIN_KEY_SIZE +
598 SERPENT_BLOCK_SIZE,
599 .max_keysize = SERPENT_MAX_KEY_SIZE +
600 SERPENT_BLOCK_SIZE,
601 .ivsize = SERPENT_BLOCK_SIZE,
602 .setkey = lrw_serpent_setkey,
603 .encrypt = lrw_encrypt,
604 .decrypt = lrw_decrypt,
605 },
606 },
607};
608
609struct serpent_xts_ctx {
610 struct serpent_ctx tweak_ctx;
611 struct serpent_ctx crypt_ctx;
612};
613
614static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
615 unsigned int keylen)
616{
617 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
618 u32 *flags = &tfm->crt_flags;
619 int err;
620
621 /* key consists of keys of equal size concatenated, therefore
622 * the length must be even
623 */
624 if (keylen % 2) {
625 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
626 return -EINVAL;
627 }
628
629 /* first half of xts-key is for crypt */
630 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
631 if (err)
632 return err;
633
634 /* second half of xts-key is for tweak */
635 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
636}
637
638static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
639 struct scatterlist *src, unsigned int nbytes)
640{
641 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
642 be128 buf[SERPENT_PARALLEL_BLOCKS];
643 struct crypt_priv crypt_ctx = {
644 .ctx = &ctx->crypt_ctx,
645 .fpu_enabled = false,
646 };
647 struct xts_crypt_req req = {
648 .tbuf = buf,
649 .tbuflen = sizeof(buf),
650
651 .tweak_ctx = &ctx->tweak_ctx,
652 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
653 .crypt_ctx = &crypt_ctx,
654 .crypt_fn = encrypt_callback,
655 };
656 int ret;
657
658 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
659 ret = xts_crypt(desc, dst, src, nbytes, &req);
660 serpent_fpu_end(crypt_ctx.fpu_enabled);
661
662 return ret;
663}
664
665static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
666 struct scatterlist *src, unsigned int nbytes)
667{
668 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
669 be128 buf[SERPENT_PARALLEL_BLOCKS];
670 struct crypt_priv crypt_ctx = {
671 .ctx = &ctx->crypt_ctx,
672 .fpu_enabled = false,
673 };
674 struct xts_crypt_req req = {
675 .tbuf = buf,
676 .tbuflen = sizeof(buf),
677
678 .tweak_ctx = &ctx->tweak_ctx,
679 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
680 .crypt_ctx = &crypt_ctx,
681 .crypt_fn = decrypt_callback,
682 };
683 int ret;
684
685 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
686 ret = xts_crypt(desc, dst, src, nbytes, &req);
687 serpent_fpu_end(crypt_ctx.fpu_enabled);
688
689 return ret;
690}
691
692static struct crypto_alg blk_xts_alg = {
693 .cra_name = "__xts-serpent-sse2",
694 .cra_driver_name = "__driver-xts-serpent-sse2",
695 .cra_priority = 0,
696 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
697 .cra_blocksize = SERPENT_BLOCK_SIZE,
698 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
699 .cra_alignmask = 0,
700 .cra_type = &crypto_blkcipher_type,
701 .cra_module = THIS_MODULE,
702 .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
703 .cra_u = {
704 .blkcipher = {
705 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
706 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
707 .ivsize = SERPENT_BLOCK_SIZE,
708 .setkey = xts_serpent_setkey,
709 .encrypt = xts_encrypt,
710 .decrypt = xts_decrypt,
711 },
712 },
713};
714
715static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
716 unsigned int key_len)
717{
718 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
719 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
720 int err;
721
722 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
723 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
724 & CRYPTO_TFM_REQ_MASK);
725 err = crypto_ablkcipher_setkey(child, key, key_len);
726 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
727 & CRYPTO_TFM_RES_MASK);
728 return err;
729}
730
731static int __ablk_encrypt(struct ablkcipher_request *req)
732{
733 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
734 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
735 struct blkcipher_desc desc;
736
737 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
738 desc.info = req->info;
739 desc.flags = 0;
740
741 return crypto_blkcipher_crt(desc.tfm)->encrypt(
742 &desc, req->dst, req->src, req->nbytes);
743}
744
745static int ablk_encrypt(struct ablkcipher_request *req)
746{
747 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
748 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
749
750 if (!irq_fpu_usable()) {
751 struct ablkcipher_request *cryptd_req =
752 ablkcipher_request_ctx(req);
753
754 memcpy(cryptd_req, req, sizeof(*req));
755 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
756
757 return crypto_ablkcipher_encrypt(cryptd_req);
758 } else {
759 return __ablk_encrypt(req);
760 }
761}
762
763static int ablk_decrypt(struct ablkcipher_request *req)
764{
765 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
766 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
767
768 if (!irq_fpu_usable()) {
769 struct ablkcipher_request *cryptd_req =
770 ablkcipher_request_ctx(req);
771
772 memcpy(cryptd_req, req, sizeof(*req));
773 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
774
775 return crypto_ablkcipher_decrypt(cryptd_req);
776 } else {
777 struct blkcipher_desc desc;
778
779 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
780 desc.info = req->info;
781 desc.flags = 0;
782
783 return crypto_blkcipher_crt(desc.tfm)->decrypt(
784 &desc, req->dst, req->src, req->nbytes);
785 }
786}
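/*
 * Annotation (not in the original patch): the ablk_* wrappers make the
 * SSE2 code usable from any context. When the FPU cannot be used
 * (irq_fpu_usable() is false, e.g. in interrupt context while a task owns
 * the FPU), the request is deferred to the cryptd workqueue; otherwise the
 * underlying __driver-* blkcipher is invoked synchronously.
 */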
787
788static void ablk_exit(struct crypto_tfm *tfm)
789{
790 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
791
792 cryptd_free_ablkcipher(ctx->cryptd_tfm);
793}
794
795static void ablk_init_common(struct crypto_tfm *tfm,
796 struct cryptd_ablkcipher *cryptd_tfm)
797{
798 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
799
800 ctx->cryptd_tfm = cryptd_tfm;
801 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
802 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
803}
804
805static int ablk_ecb_init(struct crypto_tfm *tfm)
806{
807 struct cryptd_ablkcipher *cryptd_tfm;
808
809 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
810 if (IS_ERR(cryptd_tfm))
811 return PTR_ERR(cryptd_tfm);
812 ablk_init_common(tfm, cryptd_tfm);
813 return 0;
814}
815
816static struct crypto_alg ablk_ecb_alg = {
817 .cra_name = "ecb(serpent)",
818 .cra_driver_name = "ecb-serpent-sse2",
819 .cra_priority = 400,
820 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
821 .cra_blocksize = SERPENT_BLOCK_SIZE,
822 .cra_ctxsize = sizeof(struct async_serpent_ctx),
823 .cra_alignmask = 0,
824 .cra_type = &crypto_ablkcipher_type,
825 .cra_module = THIS_MODULE,
826 .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
827 .cra_init = ablk_ecb_init,
828 .cra_exit = ablk_exit,
829 .cra_u = {
830 .ablkcipher = {
831 .min_keysize = SERPENT_MIN_KEY_SIZE,
832 .max_keysize = SERPENT_MAX_KEY_SIZE,
833 .setkey = ablk_set_key,
834 .encrypt = ablk_encrypt,
835 .decrypt = ablk_decrypt,
836 },
837 },
838};
839
840static int ablk_cbc_init(struct crypto_tfm *tfm)
841{
842 struct cryptd_ablkcipher *cryptd_tfm;
843
844 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
845 if (IS_ERR(cryptd_tfm))
846 return PTR_ERR(cryptd_tfm);
847 ablk_init_common(tfm, cryptd_tfm);
848 return 0;
849}
850
851static struct crypto_alg ablk_cbc_alg = {
852 .cra_name = "cbc(serpent)",
853 .cra_driver_name = "cbc-serpent-sse2",
854 .cra_priority = 400,
855 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
856 .cra_blocksize = SERPENT_BLOCK_SIZE,
857 .cra_ctxsize = sizeof(struct async_serpent_ctx),
858 .cra_alignmask = 0,
859 .cra_type = &crypto_ablkcipher_type,
860 .cra_module = THIS_MODULE,
861 .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
862 .cra_init = ablk_cbc_init,
863 .cra_exit = ablk_exit,
864 .cra_u = {
865 .ablkcipher = {
866 .min_keysize = SERPENT_MIN_KEY_SIZE,
867 .max_keysize = SERPENT_MAX_KEY_SIZE,
868 .ivsize = SERPENT_BLOCK_SIZE,
869 .setkey = ablk_set_key,
870 .encrypt = __ablk_encrypt,
871 .decrypt = ablk_decrypt,
872 },
873 },
874};
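/*
 * Annotation (not in the original patch): cbc(serpent) points .encrypt at
 * __ablk_encrypt directly because CBC encryption takes the serial, non-SSE2
 * path above and never touches the FPU, so it needs no irq_fpu_usable()
 * check or cryptd deferral.
 */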
875
876static int ablk_ctr_init(struct crypto_tfm *tfm)
877{
878 struct cryptd_ablkcipher *cryptd_tfm;
879
880 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
881 if (IS_ERR(cryptd_tfm))
882 return PTR_ERR(cryptd_tfm);
883 ablk_init_common(tfm, cryptd_tfm);
884 return 0;
885}
886
887static struct crypto_alg ablk_ctr_alg = {
888 .cra_name = "ctr(serpent)",
889 .cra_driver_name = "ctr-serpent-sse2",
890 .cra_priority = 400,
891 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
892 .cra_blocksize = 1,
893 .cra_ctxsize = sizeof(struct async_serpent_ctx),
894 .cra_alignmask = 0,
895 .cra_type = &crypto_ablkcipher_type,
896 .cra_module = THIS_MODULE,
897 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
898 .cra_init = ablk_ctr_init,
899 .cra_exit = ablk_exit,
900 .cra_u = {
901 .ablkcipher = {
902 .min_keysize = SERPENT_MIN_KEY_SIZE,
903 .max_keysize = SERPENT_MAX_KEY_SIZE,
904 .ivsize = SERPENT_BLOCK_SIZE,
905 .setkey = ablk_set_key,
906 .encrypt = ablk_encrypt,
907 .decrypt = ablk_encrypt,
908 .geniv = "chainiv",
909 },
910 },
911};
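/*
 * Annotation (not in the original patch): in CTR mode decryption is the
 * same keystream-XOR operation as encryption, which is why both .encrypt
 * and .decrypt point at ablk_encrypt here.
 */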
912
913static int ablk_lrw_init(struct crypto_tfm *tfm)
914{
915 struct cryptd_ablkcipher *cryptd_tfm;
916
917 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
918 if (IS_ERR(cryptd_tfm))
919 return PTR_ERR(cryptd_tfm);
920 ablk_init_common(tfm, cryptd_tfm);
921 return 0;
922}
923
924static struct crypto_alg ablk_lrw_alg = {
925 .cra_name = "lrw(serpent)",
926 .cra_driver_name = "lrw-serpent-sse2",
927 .cra_priority = 400,
928 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
929 .cra_blocksize = SERPENT_BLOCK_SIZE,
930 .cra_ctxsize = sizeof(struct async_serpent_ctx),
931 .cra_alignmask = 0,
932 .cra_type = &crypto_ablkcipher_type,
933 .cra_module = THIS_MODULE,
934 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
935 .cra_init = ablk_lrw_init,
936 .cra_exit = ablk_exit,
937 .cra_u = {
938 .ablkcipher = {
939 .min_keysize = SERPENT_MIN_KEY_SIZE +
940 SERPENT_BLOCK_SIZE,
941 .max_keysize = SERPENT_MAX_KEY_SIZE +
942 SERPENT_BLOCK_SIZE,
943 .ivsize = SERPENT_BLOCK_SIZE,
944 .setkey = ablk_set_key,
945 .encrypt = ablk_encrypt,
946 .decrypt = ablk_decrypt,
947 },
948 },
949};
950
951static int ablk_xts_init(struct crypto_tfm *tfm)
952{
953 struct cryptd_ablkcipher *cryptd_tfm;
954
955 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
956 if (IS_ERR(cryptd_tfm))
957 return PTR_ERR(cryptd_tfm);
958 ablk_init_common(tfm, cryptd_tfm);
959 return 0;
960}
961
962static struct crypto_alg ablk_xts_alg = {
963 .cra_name = "xts(serpent)",
964 .cra_driver_name = "xts-serpent-sse2",
965 .cra_priority = 400,
966 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
967 .cra_blocksize = SERPENT_BLOCK_SIZE,
968 .cra_ctxsize = sizeof(struct async_serpent_ctx),
969 .cra_alignmask = 0,
970 .cra_type = &crypto_ablkcipher_type,
971 .cra_module = THIS_MODULE,
972 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
973 .cra_init = ablk_xts_init,
974 .cra_exit = ablk_exit,
975 .cra_u = {
976 .ablkcipher = {
977 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
978 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
979 .ivsize = SERPENT_BLOCK_SIZE,
980 .setkey = ablk_set_key,
981 .encrypt = ablk_encrypt,
982 .decrypt = ablk_decrypt,
983 },
984 },
985};
986
987static int __init serpent_sse2_init(void)
988{
989 int err;
990
991 if (!cpu_has_xmm2) {
992 printk(KERN_INFO "SSE2 instructions are not detected.\n");
993 return -ENODEV;
994 }
995
996 err = crypto_register_alg(&blk_ecb_alg);
997 if (err)
998 goto blk_ecb_err;
999 err = crypto_register_alg(&blk_cbc_alg);
1000 if (err)
1001 goto blk_cbc_err;
1002 err = crypto_register_alg(&blk_ctr_alg);
1003 if (err)
1004 goto blk_ctr_err;
1005 err = crypto_register_alg(&ablk_ecb_alg);
1006 if (err)
1007 goto ablk_ecb_err;
1008 err = crypto_register_alg(&ablk_cbc_alg);
1009 if (err)
1010 goto ablk_cbc_err;
1011 err = crypto_register_alg(&ablk_ctr_alg);
1012 if (err)
1013 goto ablk_ctr_err;
1014 err = crypto_register_alg(&blk_lrw_alg);
1015 if (err)
1016 goto blk_lrw_err;
1017 err = crypto_register_alg(&ablk_lrw_alg);
1018 if (err)
1019 goto ablk_lrw_err;
1020 err = crypto_register_alg(&blk_xts_alg);
1021 if (err)
1022 goto blk_xts_err;
1023 err = crypto_register_alg(&ablk_xts_alg);
1024 if (err)
1025 goto ablk_xts_err;
1026 return err;
1027
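	/*
	 * Annotation (not in the original patch): the unregister call below
	 * is never reached on success; it pairs with the *_err labels that
	 * follow, which unwind the successful registrations in reverse order.
	 */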
1028 crypto_unregister_alg(&ablk_xts_alg);
1029ablk_xts_err:
1030 crypto_unregister_alg(&blk_xts_alg);
1031blk_xts_err:
1032 crypto_unregister_alg(&ablk_lrw_alg);
1033ablk_lrw_err:
1034 crypto_unregister_alg(&blk_lrw_alg);
1035blk_lrw_err:
1036 crypto_unregister_alg(&ablk_ctr_alg);
1037ablk_ctr_err:
1038 crypto_unregister_alg(&ablk_cbc_alg);
1039ablk_cbc_err:
1040 crypto_unregister_alg(&ablk_ecb_alg);
1041ablk_ecb_err:
1042 crypto_unregister_alg(&blk_ctr_alg);
1043blk_ctr_err:
1044 crypto_unregister_alg(&blk_cbc_alg);
1045blk_cbc_err:
1046 crypto_unregister_alg(&blk_ecb_alg);
1047blk_ecb_err:
1048 return err;
1049}
1050
1051static void __exit serpent_sse2_exit(void)
1052{
1053 crypto_unregister_alg(&ablk_xts_alg);
1054 crypto_unregister_alg(&blk_xts_alg);
1055 crypto_unregister_alg(&ablk_lrw_alg);
1056 crypto_unregister_alg(&blk_lrw_alg);
1057 crypto_unregister_alg(&ablk_ctr_alg);
1058 crypto_unregister_alg(&ablk_cbc_alg);
1059 crypto_unregister_alg(&ablk_ecb_alg);
1060 crypto_unregister_alg(&blk_ctr_alg);
1061 crypto_unregister_alg(&blk_cbc_alg);
1062 crypto_unregister_alg(&blk_ecb_alg);
1063}
1064
1065module_init(serpent_sse2_init);
1066module_exit(serpent_sse2_exit);
1067
1068MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
1069MODULE_LICENSE("GPL");
1070MODULE_ALIAS("serpent");
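For reference, a minimal sketch of how an in-kernel user on this 3.3-era ablkcipher API might drive the "ecb(serpent)" instance registered above follows. It is illustrative only and not part of the patch: the function and result-struct names are hypothetical, the buffer is assumed to be a linear kernel buffer whose length is a multiple of SERPENT_BLOCK_SIZE, and error handling is reduced to the essentials.

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/completion.h>

struct example_result {
	struct completion completion;
	int err;
};

/* Completion callback for the (possibly asynchronous) cryptd-backed tfm. */
static void example_complete(struct crypto_async_request *req, int err)
{
	struct example_result *res = req->data;

	if (err == -EINPROGRESS)
		return;
	res->err = err;
	complete(&res->completion);
}

static int example_ecb_serpent_encrypt(const u8 *key, unsigned int keylen,
				       u8 *buf, unsigned int len)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	struct example_result res;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_ablkcipher("ecb(serpent)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	init_completion(&res.completion);
	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					example_complete, &res);

	/* ECB takes no IV; encrypt buf in place. */
	sg_init_one(&sg, buf, len);
	ablkcipher_request_set_crypt(req, &sg, &sg, len, NULL);

	err = crypto_ablkcipher_encrypt(req);
	if (err == -EINPROGRESS || err == -EBUSY) {
		wait_for_completion(&res.completion);
		err = res.err;
	}

	ablkcipher_request_free(req);
out_free_tfm:
	crypto_free_ablkcipher(tfm);
	return err;
}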
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5ede9c444c3e..7fee8c152f93 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -32,6 +32,8 @@
32#include <crypto/algapi.h> 32#include <crypto/algapi.h>
33#include <crypto/twofish.h> 33#include <crypto/twofish.h>
34#include <crypto/b128ops.h> 34#include <crypto/b128ops.h>
35#include <crypto/lrw.h>
36#include <crypto/xts.h>
35 37
36/* regular block cipher functions from twofish_x86_64 module */ 38/* regular block cipher functions from twofish_x86_64 module */
37asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, 39asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
@@ -432,6 +434,209 @@ static struct crypto_alg blk_ctr_alg = {
432 }, 434 },
433}; 435};
434 436
437static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
438{
439 const unsigned int bsize = TF_BLOCK_SIZE;
440 struct twofish_ctx *ctx = priv;
441 int i;
442
443 if (nbytes == 3 * bsize) {
444 twofish_enc_blk_3way(ctx, srcdst, srcdst);
445 return;
446 }
447
448 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
449 twofish_enc_blk(ctx, srcdst, srcdst);
450}
451
452static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
453{
454 const unsigned int bsize = TF_BLOCK_SIZE;
455 struct twofish_ctx *ctx = priv;
456 int i;
457
458 if (nbytes == 3 * bsize) {
459 twofish_dec_blk_3way(ctx, srcdst, srcdst);
460 return;
461 }
462
463 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
464 twofish_dec_blk(ctx, srcdst, srcdst);
465}
466
467struct twofish_lrw_ctx {
468 struct lrw_table_ctx lrw_table;
469 struct twofish_ctx twofish_ctx;
470};
471
472static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
473 unsigned int keylen)
474{
475 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
476 int err;
477
478 err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
479 &tfm->crt_flags);
480 if (err)
481 return err;
482
483 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
484}
485
486static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
487 struct scatterlist *src, unsigned int nbytes)
488{
489 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
490 be128 buf[3];
491 struct lrw_crypt_req req = {
492 .tbuf = buf,
493 .tbuflen = sizeof(buf),
494
495 .table_ctx = &ctx->lrw_table,
496 .crypt_ctx = &ctx->twofish_ctx,
497 .crypt_fn = encrypt_callback,
498 };
499
500 return lrw_crypt(desc, dst, src, nbytes, &req);
501}
502
503static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
504 struct scatterlist *src, unsigned int nbytes)
505{
506 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
507 be128 buf[3];
508 struct lrw_crypt_req req = {
509 .tbuf = buf,
510 .tbuflen = sizeof(buf),
511
512 .table_ctx = &ctx->lrw_table,
513 .crypt_ctx = &ctx->twofish_ctx,
514 .crypt_fn = decrypt_callback,
515 };
516
517 return lrw_crypt(desc, dst, src, nbytes, &req);
518}
519
520static void lrw_exit_tfm(struct crypto_tfm *tfm)
521{
522 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
523
524 lrw_free_table(&ctx->lrw_table);
525}
526
527static struct crypto_alg blk_lrw_alg = {
528 .cra_name = "lrw(twofish)",
529 .cra_driver_name = "lrw-twofish-3way",
530 .cra_priority = 300,
531 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
532 .cra_blocksize = TF_BLOCK_SIZE,
533 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
534 .cra_alignmask = 0,
535 .cra_type = &crypto_blkcipher_type,
536 .cra_module = THIS_MODULE,
537 .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
538 .cra_exit = lrw_exit_tfm,
539 .cra_u = {
540 .blkcipher = {
541 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
542 .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
543 .ivsize = TF_BLOCK_SIZE,
544 .setkey = lrw_twofish_setkey,
545 .encrypt = lrw_encrypt,
546 .decrypt = lrw_decrypt,
547 },
548 },
549};
550
551struct twofish_xts_ctx {
552 struct twofish_ctx tweak_ctx;
553 struct twofish_ctx crypt_ctx;
554};
555
556static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
557 unsigned int keylen)
558{
559 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
560 u32 *flags = &tfm->crt_flags;
561 int err;
562
563 /* key consists of keys of equal size concatenated, therefore
564 * the length must be even
565 */
566 if (keylen % 2) {
567 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
568 return -EINVAL;
569 }
570
571 /* first half of xts-key is for crypt */
572 err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
573 if (err)
574 return err;
575
576 /* second half of xts-key is for tweak */
577 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
578 flags);
579}
580
581static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
582 struct scatterlist *src, unsigned int nbytes)
583{
584 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
585 be128 buf[3];
586 struct xts_crypt_req req = {
587 .tbuf = buf,
588 .tbuflen = sizeof(buf),
589
590 .tweak_ctx = &ctx->tweak_ctx,
591 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
592 .crypt_ctx = &ctx->crypt_ctx,
593 .crypt_fn = encrypt_callback,
594 };
595
596 return xts_crypt(desc, dst, src, nbytes, &req);
597}
598
599static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
600 struct scatterlist *src, unsigned int nbytes)
601{
602 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
603 be128 buf[3];
604 struct xts_crypt_req req = {
605 .tbuf = buf,
606 .tbuflen = sizeof(buf),
607
608 .tweak_ctx = &ctx->tweak_ctx,
609 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
610 .crypt_ctx = &ctx->crypt_ctx,
611 .crypt_fn = decrypt_callback,
612 };
613
614 return xts_crypt(desc, dst, src, nbytes, &req);
615}
616
617static struct crypto_alg blk_xts_alg = {
618 .cra_name = "xts(twofish)",
619 .cra_driver_name = "xts-twofish-3way",
620 .cra_priority = 300,
621 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
622 .cra_blocksize = TF_BLOCK_SIZE,
623 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
624 .cra_alignmask = 0,
625 .cra_type = &crypto_blkcipher_type,
626 .cra_module = THIS_MODULE,
627 .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
628 .cra_u = {
629 .blkcipher = {
630 .min_keysize = TF_MIN_KEY_SIZE * 2,
631 .max_keysize = TF_MAX_KEY_SIZE * 2,
632 .ivsize = TF_BLOCK_SIZE,
633 .setkey = xts_twofish_setkey,
634 .encrypt = xts_encrypt,
635 .decrypt = xts_decrypt,
636 },
637 },
638};
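/*
 * Annotation (not in the original patch): the 3-way twofish assembler path
 * is plain integer code, so unlike the serpent-sse2 glue above these
 * LRW/XTS helpers need no FPU/SSE section management and pass the
 * twofish_ctx to lrw_crypt()/xts_crypt() directly.
 */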
639
435int __init init(void) 640int __init init(void)
436{ 641{
437 int err; 642 int err;
@@ -445,9 +650,20 @@ int __init init(void)
445 err = crypto_register_alg(&blk_ctr_alg); 650 err = crypto_register_alg(&blk_ctr_alg);
446 if (err) 651 if (err)
447 goto ctr_err; 652 goto ctr_err;
653 err = crypto_register_alg(&blk_lrw_alg);
654 if (err)
655 goto blk_lrw_err;
656 err = crypto_register_alg(&blk_xts_alg);
657 if (err)
658 goto blk_xts_err;
448 659
449 return 0; 660 return 0;
450 661
662 crypto_unregister_alg(&blk_xts_alg);
663blk_xts_err:
664 crypto_unregister_alg(&blk_lrw_alg);
665blk_lrw_err:
666 crypto_unregister_alg(&blk_ctr_alg);
451ctr_err: 667ctr_err:
452 crypto_unregister_alg(&blk_cbc_alg); 668 crypto_unregister_alg(&blk_cbc_alg);
453cbc_err: 669cbc_err:
@@ -458,6 +674,8 @@ ecb_err:
458 674
459void __exit fini(void) 675void __exit fini(void)
460{ 676{
677 crypto_unregister_alg(&blk_xts_alg);
678 crypto_unregister_alg(&blk_lrw_alg);
461 crypto_unregister_alg(&blk_ctr_alg); 679 crypto_unregister_alg(&blk_ctr_alg);
462 crypto_unregister_alg(&blk_cbc_alg); 680 crypto_unregister_alg(&blk_cbc_alg);
463 crypto_unregister_alg(&blk_ecb_alg); 681 crypto_unregister_alg(&blk_ecb_alg);
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccfcf6ea..455646e0e532 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
6 7
7sysv-$(CONFIG_SYSVIPC) := ipc32.o 8sysv-$(CONFIG_SYSVIPC) := ipc32.o
8obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) 9obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec1b284..e3e734005e19 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
14#include <asm/segment.h> 14#include <asm/segment.h>
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17#include <linux/err.h>
17 18
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 19/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h> 20#include <linux/elf-em.h>
@@ -27,8 +28,6 @@
27 28
28 .section .entry.text, "ax" 29 .section .entry.text, "ax"
29 30
30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
31
32 .macro IA32_ARG_FIXUP noebp=0 31 .macro IA32_ARG_FIXUP noebp=0
33 movl %edi,%r8d 32 movl %edi,%r8d
34 .if \noebp 33 .if \noebp
@@ -134,7 +133,7 @@ ENTRY(ia32_sysenter_target)
134 CFI_REL_OFFSET rsp,0 133 CFI_REL_OFFSET rsp,0
135 pushfq_cfi 134 pushfq_cfi
136 /*CFI_REL_OFFSET rflags,0*/ 135 /*CFI_REL_OFFSET rflags,0*/
137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 136 movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
138 CFI_REGISTER rip,r10 137 CFI_REGISTER rip,r10
139 pushq_cfi $__USER32_CS 138 pushq_cfi $__USER32_CS
140 /*CFI_REL_OFFSET cs,0*/ 139 /*CFI_REL_OFFSET cs,0*/
@@ -150,9 +149,8 @@ ENTRY(ia32_sysenter_target)
150 .section __ex_table,"a" 149 .section __ex_table,"a"
151 .quad 1b,ia32_badarg 150 .quad 1b,ia32_badarg
152 .previous 151 .previous
153 GET_THREAD_INFO(%r10) 152 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
154 orl $TS_COMPAT,TI_status(%r10) 153 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
155 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
156 CFI_REMEMBER_STATE 154 CFI_REMEMBER_STATE
157 jnz sysenter_tracesys 155 jnz sysenter_tracesys
158 cmpq $(IA32_NR_syscalls-1),%rax 156 cmpq $(IA32_NR_syscalls-1),%rax
@@ -162,13 +160,12 @@ sysenter_do_call:
162sysenter_dispatch: 160sysenter_dispatch:
163 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
164 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
165 GET_THREAD_INFO(%r10)
166 DISABLE_INTERRUPTS(CLBR_NONE) 163 DISABLE_INTERRUPTS(CLBR_NONE)
167 TRACE_IRQS_OFF 164 TRACE_IRQS_OFF
168 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 165 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
169 jnz sysexit_audit 166 jnz sysexit_audit
170sysexit_from_sys_call: 167sysexit_from_sys_call:
171 andl $~TS_COMPAT,TI_status(%r10) 168 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
172 /* clear IF, that popfq doesn't enable interrupts early */ 169 /* clear IF, that popfq doesn't enable interrupts early */
173 andl $~0x200,EFLAGS-R11(%rsp) 170 andl $~0x200,EFLAGS-R11(%rsp)
174 movl RIP-R11(%rsp),%edx /* User %eip */ 171 movl RIP-R11(%rsp),%edx /* User %eip */
@@ -193,7 +190,7 @@ sysexit_from_sys_call:
193 movl %ebx,%edx /* 3rd arg: 1st syscall arg */ 190 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
194 movl %eax,%esi /* 2nd arg: syscall number */ 191 movl %eax,%esi /* 2nd arg: syscall number */
195 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ 192 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
196 call audit_syscall_entry 193 call __audit_syscall_entry
197 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ 194 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
198 cmpq $(IA32_NR_syscalls-1),%rax 195 cmpq $(IA32_NR_syscalls-1),%rax
199 ja ia32_badsys 196 ja ia32_badsys
@@ -205,22 +202,22 @@ sysexit_from_sys_call:
205 .endm 202 .endm
206 203
207 .macro auditsys_exit exit 204 .macro auditsys_exit exit
208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 205 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
209 jnz ia32_ret_from_sys_call 206 jnz ia32_ret_from_sys_call
210 TRACE_IRQS_ON 207 TRACE_IRQS_ON
211 sti 208 sti
212 movl %eax,%esi /* second arg, syscall return value */ 209 movl %eax,%esi /* second arg, syscall return value */
213 cmpl $0,%eax /* is it < 0? */ 210 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
214 setl %al /* 1 if so, 0 if not */ 211 jbe 1f
212 movslq %eax, %rsi /* if error sign extend to 64 bits */
2131: setbe %al /* 1 if error, 0 if not */
215 movzbl %al,%edi /* zero-extend that into %edi */ 214 movzbl %al,%edi /* zero-extend that into %edi */
216 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 215 call __audit_syscall_exit
217 call audit_syscall_exit 216 movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
218 GET_THREAD_INFO(%r10)
219 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
220 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
221 cli 218 cli
222 TRACE_IRQS_OFF 219 TRACE_IRQS_OFF
223 testl %edi,TI_flags(%r10) 220 testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
224 jz \exit 221 jz \exit
225 CLEAR_RREGS -ARGOFFSET 222 CLEAR_RREGS -ARGOFFSET
226 jmp int_with_check 223 jmp int_with_check
@@ -238,7 +235,7 @@ sysexit_audit:
238 235
239sysenter_tracesys: 236sysenter_tracesys:
240#ifdef CONFIG_AUDITSYSCALL 237#ifdef CONFIG_AUDITSYSCALL
241 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 238 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
242 jz sysenter_auditsys 239 jz sysenter_auditsys
243#endif 240#endif
244 SAVE_REST 241 SAVE_REST
@@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target)
309 .section __ex_table,"a" 306 .section __ex_table,"a"
310 .quad 1b,ia32_badarg 307 .quad 1b,ia32_badarg
311 .previous 308 .previous
312 GET_THREAD_INFO(%r10) 309 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
313 orl $TS_COMPAT,TI_status(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
315 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
316 jnz cstar_tracesys 312 jnz cstar_tracesys
317 cmpq $IA32_NR_syscalls-1,%rax 313 cmpq $IA32_NR_syscalls-1,%rax
@@ -321,13 +317,12 @@ cstar_do_call:
321cstar_dispatch: 317cstar_dispatch:
322 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
323 movq %rax,RAX-ARGOFFSET(%rsp) 319 movq %rax,RAX-ARGOFFSET(%rsp)
324 GET_THREAD_INFO(%r10)
325 DISABLE_INTERRUPTS(CLBR_NONE) 320 DISABLE_INTERRUPTS(CLBR_NONE)
326 TRACE_IRQS_OFF 321 TRACE_IRQS_OFF
327 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 322 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
328 jnz sysretl_audit 323 jnz sysretl_audit
329sysretl_from_sys_call: 324sysretl_from_sys_call:
330 andl $~TS_COMPAT,TI_status(%r10) 325 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
331 RESTORE_ARGS 0,-ARG_SKIP,0,0,0 326 RESTORE_ARGS 0,-ARG_SKIP,0,0,0
332 movl RIP-ARGOFFSET(%rsp),%ecx 327 movl RIP-ARGOFFSET(%rsp),%ecx
333 CFI_REGISTER rip,rcx 328 CFI_REGISTER rip,rcx
@@ -355,7 +350,7 @@ sysretl_audit:
355 350
356cstar_tracesys: 351cstar_tracesys:
357#ifdef CONFIG_AUDITSYSCALL 352#ifdef CONFIG_AUDITSYSCALL
358 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 353 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
359 jz cstar_auditsys 354 jz cstar_auditsys
360#endif 355#endif
361 xchgl %r9d,%ebp 356 xchgl %r9d,%ebp
@@ -420,9 +415,8 @@ ENTRY(ia32_syscall)
420 /* note the registers are not zero extended to the sf. 415 /* note the registers are not zero extended to the sf.
421 this could be a problem. */ 416 this could be a problem. */
422 SAVE_ARGS 0,1,0 417 SAVE_ARGS 0,1,0
423 GET_THREAD_INFO(%r10) 418 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
424 orl $TS_COMPAT,TI_status(%r10) 419 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
426 jnz ia32_tracesys 420 jnz ia32_tracesys
427 cmpq $(IA32_NR_syscalls-1),%rax 421 cmpq $(IA32_NR_syscalls-1),%rax
428 ja ia32_badsys 422 ja ia32_badsys
@@ -453,14 +447,11 @@ ia32_badsys:
453 movq $-ENOSYS,%rax 447 movq $-ENOSYS,%rax
454 jmp ia32_sysret 448 jmp ia32_sysret
455 449
456quiet_ni_syscall:
457 movq $-ENOSYS,%rax
458 ret
459 CFI_ENDPROC 450 CFI_ENDPROC
460 451
461 .macro PTREGSCALL label, func, arg 452 .macro PTREGSCALL label, func, arg
462 .globl \label 453 ALIGN
463\label: 454GLOBAL(\label)
464 leaq \func(%rip),%rax 455 leaq \func(%rip),%rax
465 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 456 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
466 jmp ia32_ptregs_common 457 jmp ia32_ptregs_common
@@ -477,7 +468,8 @@ quiet_ni_syscall:
477 PTREGSCALL stub32_vfork, sys_vfork, %rdi 468 PTREGSCALL stub32_vfork, sys_vfork, %rdi
478 PTREGSCALL stub32_iopl, sys_iopl, %rsi 469 PTREGSCALL stub32_iopl, sys_iopl, %rsi
479 470
480ENTRY(ia32_ptregs_common) 471 ALIGN
472ia32_ptregs_common:
481 popq %r11 473 popq %r11
482 CFI_ENDPROC 474 CFI_ENDPROC
483 CFI_STARTPROC32 simple 475 CFI_STARTPROC32 simple
@@ -499,357 +491,3 @@ ENTRY(ia32_ptregs_common)
499 jmp ia32_sysret /* misbalances the return cache */ 491 jmp ia32_sysret /* misbalances the return cache */
500 CFI_ENDPROC 492 CFI_ENDPROC
501END(ia32_ptregs_common) 493END(ia32_ptregs_common)
502
503 .section .rodata,"a"
504 .align 8
505ia32_sys_call_table:
506 .quad sys_restart_syscall
507 .quad sys_exit
508 .quad stub32_fork
509 .quad sys_read
510 .quad sys_write
511 .quad compat_sys_open /* 5 */
512 .quad sys_close
513 .quad sys32_waitpid
514 .quad sys_creat
515 .quad sys_link
516 .quad sys_unlink /* 10 */
517 .quad stub32_execve
518 .quad sys_chdir
519 .quad compat_sys_time
520 .quad sys_mknod
521 .quad sys_chmod /* 15 */
522 .quad sys_lchown16
523 .quad quiet_ni_syscall /* old break syscall holder */
524 .quad sys_stat
525 .quad sys32_lseek
526 .quad sys_getpid /* 20 */
527 .quad compat_sys_mount /* mount */
528 .quad sys_oldumount /* old_umount */
529 .quad sys_setuid16
530 .quad sys_getuid16
531 .quad compat_sys_stime /* stime */ /* 25 */
532 .quad compat_sys_ptrace /* ptrace */
533 .quad sys_alarm
534 .quad sys_fstat /* (old)fstat */
535 .quad sys_pause
536 .quad compat_sys_utime /* 30 */
537 .quad quiet_ni_syscall /* old stty syscall holder */
538 .quad quiet_ni_syscall /* old gtty syscall holder */
539 .quad sys_access
540 .quad sys_nice
541 .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
542 .quad sys_sync
543 .quad sys32_kill
544 .quad sys_rename
545 .quad sys_mkdir
546 .quad sys_rmdir /* 40 */
547 .quad sys_dup
548 .quad sys_pipe
549 .quad compat_sys_times
550 .quad quiet_ni_syscall /* old prof syscall holder */
551 .quad sys_brk /* 45 */
552 .quad sys_setgid16
553 .quad sys_getgid16
554 .quad sys_signal
555 .quad sys_geteuid16
556 .quad sys_getegid16 /* 50 */
557 .quad sys_acct
558 .quad sys_umount /* new_umount */
559 .quad quiet_ni_syscall /* old lock syscall holder */
560 .quad compat_sys_ioctl
561 .quad compat_sys_fcntl64 /* 55 */
562 .quad quiet_ni_syscall /* old mpx syscall holder */
563 .quad sys_setpgid
564 .quad quiet_ni_syscall /* old ulimit syscall holder */
565 .quad sys_olduname
566 .quad sys_umask /* 60 */
567 .quad sys_chroot
568 .quad compat_sys_ustat
569 .quad sys_dup2
570 .quad sys_getppid
571 .quad sys_getpgrp /* 65 */
572 .quad sys_setsid
573 .quad sys32_sigaction
574 .quad sys_sgetmask
575 .quad sys_ssetmask
576 .quad sys_setreuid16 /* 70 */
577 .quad sys_setregid16
578 .quad sys32_sigsuspend
579 .quad compat_sys_sigpending
580 .quad sys_sethostname
581 .quad compat_sys_setrlimit /* 75 */
582 .quad compat_sys_old_getrlimit /* old_getrlimit */
583 .quad compat_sys_getrusage
584 .quad compat_sys_gettimeofday
585 .quad compat_sys_settimeofday
586 .quad sys_getgroups16 /* 80 */
587 .quad sys_setgroups16
588 .quad compat_sys_old_select
589 .quad sys_symlink
590 .quad sys_lstat
591 .quad sys_readlink /* 85 */
592 .quad sys_uselib
593 .quad sys_swapon
594 .quad sys_reboot
595 .quad compat_sys_old_readdir
596 .quad sys32_mmap /* 90 */
597 .quad sys_munmap
598 .quad sys_truncate
599 .quad sys_ftruncate
600 .quad sys_fchmod
601 .quad sys_fchown16 /* 95 */
602 .quad sys_getpriority
603 .quad sys_setpriority
604 .quad quiet_ni_syscall /* old profil syscall holder */
605 .quad compat_sys_statfs
606 .quad compat_sys_fstatfs /* 100 */
607 .quad sys_ioperm
608 .quad compat_sys_socketcall
609 .quad sys_syslog
610 .quad compat_sys_setitimer
611 .quad compat_sys_getitimer /* 105 */
612 .quad compat_sys_newstat
613 .quad compat_sys_newlstat
614 .quad compat_sys_newfstat
615 .quad sys_uname
616 .quad stub32_iopl /* 110 */
617 .quad sys_vhangup
618 .quad quiet_ni_syscall /* old "idle" system call */
619 .quad sys32_vm86_warning /* vm86old */
620 .quad compat_sys_wait4
621 .quad sys_swapoff /* 115 */
622 .quad compat_sys_sysinfo
623 .quad sys32_ipc
624 .quad sys_fsync
625 .quad stub32_sigreturn
626 .quad stub32_clone /* 120 */
627 .quad sys_setdomainname
628 .quad sys_newuname
629 .quad sys_modify_ldt
630 .quad compat_sys_adjtimex
631 .quad sys32_mprotect /* 125 */
632 .quad compat_sys_sigprocmask
633 .quad quiet_ni_syscall /* create_module */
634 .quad sys_init_module
635 .quad sys_delete_module
636 .quad quiet_ni_syscall /* 130 get_kernel_syms */
637 .quad sys32_quotactl
638 .quad sys_getpgid
639 .quad sys_fchdir
640 .quad quiet_ni_syscall /* bdflush */
641 .quad sys_sysfs /* 135 */
642 .quad sys_personality
643 .quad quiet_ni_syscall /* for afs_syscall */
644 .quad sys_setfsuid16
645 .quad sys_setfsgid16
646 .quad sys_llseek /* 140 */
647 .quad compat_sys_getdents
648 .quad compat_sys_select
649 .quad sys_flock
650 .quad sys_msync
651 .quad compat_sys_readv /* 145 */
652 .quad compat_sys_writev
653 .quad sys_getsid
654 .quad sys_fdatasync
655 .quad compat_sys_sysctl /* sysctl */
656 .quad sys_mlock /* 150 */
657 .quad sys_munlock
658 .quad sys_mlockall
659 .quad sys_munlockall
660 .quad sys_sched_setparam
661 .quad sys_sched_getparam /* 155 */
662 .quad sys_sched_setscheduler
663 .quad sys_sched_getscheduler
664 .quad sys_sched_yield
665 .quad sys_sched_get_priority_max
666 .quad sys_sched_get_priority_min /* 160 */
667 .quad sys32_sched_rr_get_interval
668 .quad compat_sys_nanosleep
669 .quad sys_mremap
670 .quad sys_setresuid16
671 .quad sys_getresuid16 /* 165 */
672 .quad sys32_vm86_warning /* vm86 */
673 .quad quiet_ni_syscall /* query_module */
674 .quad sys_poll
675 .quad quiet_ni_syscall /* old nfsservctl */
676 .quad sys_setresgid16 /* 170 */
677 .quad sys_getresgid16
678 .quad sys_prctl
679 .quad stub32_rt_sigreturn
680 .quad sys32_rt_sigaction
681 .quad sys32_rt_sigprocmask /* 175 */
682 .quad sys32_rt_sigpending
683 .quad compat_sys_rt_sigtimedwait
684 .quad sys32_rt_sigqueueinfo
685 .quad sys_rt_sigsuspend
686 .quad sys32_pread /* 180 */
687 .quad sys32_pwrite
688 .quad sys_chown16
689 .quad sys_getcwd
690 .quad sys_capget
691 .quad sys_capset
692 .quad stub32_sigaltstack
693 .quad sys32_sendfile
694 .quad quiet_ni_syscall /* streams1 */
695 .quad quiet_ni_syscall /* streams2 */
696 .quad stub32_vfork /* 190 */
697 .quad compat_sys_getrlimit
698 .quad sys_mmap_pgoff
699 .quad sys32_truncate64
700 .quad sys32_ftruncate64
701 .quad sys32_stat64 /* 195 */
702 .quad sys32_lstat64
703 .quad sys32_fstat64
704 .quad sys_lchown
705 .quad sys_getuid
706 .quad sys_getgid /* 200 */
707 .quad sys_geteuid
708 .quad sys_getegid
709 .quad sys_setreuid
710 .quad sys_setregid
711 .quad sys_getgroups /* 205 */
712 .quad sys_setgroups
713 .quad sys_fchown
714 .quad sys_setresuid
715 .quad sys_getresuid
716 .quad sys_setresgid /* 210 */
717 .quad sys_getresgid
718 .quad sys_chown
719 .quad sys_setuid
720 .quad sys_setgid
721 .quad sys_setfsuid /* 215 */
722 .quad sys_setfsgid
723 .quad sys_pivot_root
724 .quad sys_mincore
725 .quad sys_madvise
726 .quad compat_sys_getdents64 /* 220 getdents64 */
727 .quad compat_sys_fcntl64
728 .quad quiet_ni_syscall /* tux */
729 .quad quiet_ni_syscall /* security */
730 .quad sys_gettid
731 .quad sys32_readahead /* 225 */
732 .quad sys_setxattr
733 .quad sys_lsetxattr
734 .quad sys_fsetxattr
735 .quad sys_getxattr
736 .quad sys_lgetxattr /* 230 */
737 .quad sys_fgetxattr
738 .quad sys_listxattr
739 .quad sys_llistxattr
740 .quad sys_flistxattr
741 .quad sys_removexattr /* 235 */
742 .quad sys_lremovexattr
743 .quad sys_fremovexattr
744 .quad sys_tkill
745 .quad sys_sendfile64
746 .quad compat_sys_futex /* 240 */
747 .quad compat_sys_sched_setaffinity
748 .quad compat_sys_sched_getaffinity
749 .quad sys_set_thread_area
750 .quad sys_get_thread_area
751 .quad compat_sys_io_setup /* 245 */
752 .quad sys_io_destroy
753 .quad compat_sys_io_getevents
754 .quad compat_sys_io_submit
755 .quad sys_io_cancel
756 .quad sys32_fadvise64 /* 250 */
757 .quad quiet_ni_syscall /* free_huge_pages */
758 .quad sys_exit_group
759 .quad sys32_lookup_dcookie
760 .quad sys_epoll_create
761 .quad sys_epoll_ctl /* 255 */
762 .quad sys_epoll_wait
763 .quad sys_remap_file_pages
764 .quad sys_set_tid_address
765 .quad compat_sys_timer_create
766 .quad compat_sys_timer_settime /* 260 */
767 .quad compat_sys_timer_gettime
768 .quad sys_timer_getoverrun
769 .quad sys_timer_delete
770 .quad compat_sys_clock_settime
771 .quad compat_sys_clock_gettime /* 265 */
772 .quad compat_sys_clock_getres
773 .quad compat_sys_clock_nanosleep
774 .quad compat_sys_statfs64
775 .quad compat_sys_fstatfs64
776 .quad sys_tgkill /* 270 */
777 .quad compat_sys_utimes
778 .quad sys32_fadvise64_64
779 .quad quiet_ni_syscall /* sys_vserver */
780 .quad sys_mbind
781 .quad compat_sys_get_mempolicy /* 275 */
782 .quad sys_set_mempolicy
783 .quad compat_sys_mq_open
784 .quad sys_mq_unlink
785 .quad compat_sys_mq_timedsend
786 .quad compat_sys_mq_timedreceive /* 280 */
787 .quad compat_sys_mq_notify
788 .quad compat_sys_mq_getsetattr
789 .quad compat_sys_kexec_load /* reserved for kexec */
790 .quad compat_sys_waitid
791 .quad quiet_ni_syscall /* 285: sys_altroot */
792 .quad sys_add_key
793 .quad sys_request_key
794 .quad sys_keyctl
795 .quad sys_ioprio_set
796 .quad sys_ioprio_get /* 290 */
797 .quad sys_inotify_init
798 .quad sys_inotify_add_watch
799 .quad sys_inotify_rm_watch
800 .quad sys_migrate_pages
801 .quad compat_sys_openat /* 295 */
802 .quad sys_mkdirat
803 .quad sys_mknodat
804 .quad sys_fchownat
805 .quad compat_sys_futimesat
806 .quad sys32_fstatat /* 300 */
807 .quad sys_unlinkat
808 .quad sys_renameat
809 .quad sys_linkat
810 .quad sys_symlinkat
811 .quad sys_readlinkat /* 305 */
812 .quad sys_fchmodat
813 .quad sys_faccessat
814 .quad compat_sys_pselect6
815 .quad compat_sys_ppoll
816 .quad sys_unshare /* 310 */
817 .quad compat_sys_set_robust_list
818 .quad compat_sys_get_robust_list
819 .quad sys_splice
820 .quad sys32_sync_file_range
821 .quad sys_tee /* 315 */
822 .quad compat_sys_vmsplice
823 .quad compat_sys_move_pages
824 .quad sys_getcpu
825 .quad sys_epoll_pwait
826 .quad compat_sys_utimensat /* 320 */
827 .quad compat_sys_signalfd
828 .quad sys_timerfd_create
829 .quad sys_eventfd
830 .quad sys32_fallocate
831 .quad compat_sys_timerfd_settime /* 325 */
832 .quad compat_sys_timerfd_gettime
833 .quad compat_sys_signalfd4
834 .quad sys_eventfd2
835 .quad sys_epoll_create1
836 .quad sys_dup3 /* 330 */
837 .quad sys_pipe2
838 .quad sys_inotify_init1
839 .quad compat_sys_preadv
840 .quad compat_sys_pwritev
841 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
842 .quad sys_perf_event_open
843 .quad compat_sys_recvmmsg
844 .quad sys_fanotify_init
845 .quad sys32_fanotify_mark
846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
851 .quad compat_sys_sendmmsg /* 345 */
852 .quad sys_setns
853 .quad compat_sys_process_vm_readv
854 .quad compat_sys_process_vm_writev
855ia32_syscall_end:
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
new file mode 100644
index 000000000000..51ecd5b4e787
--- /dev/null
+++ b/arch/x86/ia32/nosyscall.c
@@ -0,0 +1,7 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3
4long compat_ni_syscall(void)
5{
6 return -ENOSYS;
7}
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
new file mode 100644
index 000000000000..4754ba0f5d9f
--- /dev/null
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -0,0 +1,25 @@
1/* System call table for ia32 emulation. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
13
14typedef void (*sys_call_ptr_t)(void);
15
16extern void compat_ni_syscall(void);
17
18const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
24#include <asm/syscalls_32.h>
25};
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 6fa90a845e4c..b57e6a43a37a 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -19,7 +19,8 @@ header-y += processor-flags.h
19header-y += ptrace-abi.h 19header-y += ptrace-abi.h
20header-y += sigcontext32.h 20header-y += sigcontext32.h
21header-y += ucontext.h 21header-y += ucontext.h
22header-y += unistd_32.h
23header-y += unistd_64.h
24header-y += vm86.h 22header-y += vm86.h
25header-y += vsyscall.h 23header-y += vsyscall.h
24
25genhdr-y += unistd_32.h
26genhdr-y += unistd_64.h
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 091508b533b4..952bd0100c5c 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -4,10 +4,10 @@
4 4
5#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
6 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
71: lock 7672: lock
8 .section .smp_locks,"a" 8 .section .smp_locks,"a"
9 .balign 4 9 .balign 4
10 .long 1b - . 10 .long 672b - .
11 .previous 11 .previous
12 .endm 12 .endm
13#else 13#else
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071704a5..49ad773f4b9f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_AMD_NB_H 1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_AMD_NB_H 2#define _ASM_X86_AMD_NB_H
3 3
4#include <linux/ioport.h>
4#include <linux/pci.h> 5#include <linux/pci.h>
5 6
6struct amd_nb_bus_dev_range { 7struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 14extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14 15
15extern bool early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern struct resource *amd_get_mmconfig_range(struct resource *res);
16extern int amd_cache_northbridges(void); 18extern int amd_cache_northbridges(void);
17extern void amd_flush_garts(void); 19extern void amd_flush_garts(void);
18extern int amd_numa_init(void); 20extern int amd_numa_init(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1a6c09af048f..3ab9bdd87e79 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -176,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void)
176} 176}
177 177
178extern int x2apic_phys; 178extern int x2apic_phys;
179extern int x2apic_preenabled;
179extern void check_x2apic(void); 180extern void check_x2apic(void);
180extern void enable_x2apic(void); 181extern void enable_x2apic(void);
181extern void x2apic_icr_write(u32 low, u32 id); 182extern void x2apic_icr_write(u32 low, u32 id);
@@ -198,6 +199,9 @@ static inline void x2apic_force_phys(void)
198 x2apic_phys = 1; 199 x2apic_phys = 1;
199} 200}
200#else 201#else
202static inline void disable_x2apic(void)
203{
204}
201static inline void check_x2apic(void) 205static inline void check_x2apic(void)
202{ 206{
203} 207}
@@ -212,6 +216,7 @@ static inline void x2apic_force_phys(void)
212{ 216{
213} 217}
214 218
219#define nox2apic 0
215#define x2apic_preenabled 0 220#define x2apic_preenabled 0
216#define x2apic_supported() 0 221#define x2apic_supported() 0
217#endif 222#endif
@@ -410,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
410#endif 415#endif
411 416
412#ifdef CONFIG_X86_LOCAL_APIC 417#ifdef CONFIG_X86_LOCAL_APIC
418
413static inline u32 apic_read(u32 reg) 419static inline u32 apic_read(u32 reg)
414{ 420{
415 return apic->read(reg); 421 return apic->read(reg);
diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h
new file mode 100644
index 000000000000..a2d312796440
--- /dev/null
+++ b/arch/x86/include/asm/apic_flat_64.h
@@ -0,0 +1,7 @@
1#ifndef _ASM_X86_APIC_FLAT_64_H
2#define _ASM_X86_APIC_FLAT_64_H
3
4extern void flat_init_apic_ldr(void);
5
6#endif
7
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3925d8007864..134bba00df09 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -144,6 +144,7 @@
144 144
145#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) 145#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
146#define APIC_BASE_MSR 0x800 146#define APIC_BASE_MSR 0x800
147#define XAPIC_ENABLE (1UL << 11)
147#define X2APIC_ENABLE (1UL << 10) 148#define X2APIC_ENABLE (1UL << 10)
148 149
149#ifdef CONFIG_X86_32 150#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 24098aafce0d..fa13f0ec2874 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -82,7 +82,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
82 * 82 *
83 * Atomically reads the value of @v and returns it. 83 * Atomically reads the value of @v and returns it.
84 */ 84 */
85static inline long long atomic64_read(atomic64_t *v) 85static inline long long atomic64_read(const atomic64_t *v)
86{ 86{
87 long long r; 87 long long r;
88 asm volatile(ATOMIC64_ALTERNATIVE(read) 88 asm volatile(ATOMIC64_ALTERNATIVE(read)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e5920e..b97596e2b68c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word)
380 return word; 380 return word;
381} 381}
382 382
383#undef ADDR
384
383#ifdef __KERNEL__ 385#ifdef __KERNEL__
384/** 386/**
385 * ffs - find first set bit in word 387 * ffs - find first set bit in word
@@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
395static inline int ffs(int x) 397static inline int ffs(int x)
396{ 398{
397 int r; 399 int r;
398#ifdef CONFIG_X86_CMOV 400
401#ifdef CONFIG_X86_64
402 /*
403 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
404 * dest reg is undefined if x==0, but their CPU architect says its
405 * value is written to set it to the same as before, except that the
406 * top 32 bits will be cleared.
407 *
408 * We cannot do this on 32 bits because at the very least some
409 * 486 CPUs did not behave this way.
410 */
411 long tmp = -1;
412 asm("bsfl %1,%0"
413 : "=r" (r)
414 : "rm" (x), "0" (tmp));
415#elif defined(CONFIG_X86_CMOV)
399 asm("bsfl %1,%0\n\t" 416 asm("bsfl %1,%0\n\t"
400 "cmovzl %2,%0" 417 "cmovzl %2,%0"
401 : "=r" (r) : "rm" (x), "r" (-1)); 418 : "=&r" (r) : "rm" (x), "r" (-1));
402#else 419#else
403 asm("bsfl %1,%0\n\t" 420 asm("bsfl %1,%0\n\t"
404 "jnz 1f\n\t" 421 "jnz 1f\n\t"
@@ -422,7 +439,22 @@ static inline int ffs(int x)
422static inline int fls(int x) 439static inline int fls(int x)
423{ 440{
424 int r; 441 int r;
425#ifdef CONFIG_X86_CMOV 442
443#ifdef CONFIG_X86_64
444 /*
445 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
446 * dest reg is undefined if x==0, but their CPU architect says its
447 * value is written to set it to the same as before, except that the
448 * top 32 bits will be cleared.
449 *
450 * We cannot do this on 32 bits because at the very least some
451 * 486 CPUs did not behave this way.
452 */
453 long tmp = -1;
454 asm("bsrl %1,%0"
455 : "=r" (r)
456 : "rm" (x), "0" (tmp));
457#elif defined(CONFIG_X86_CMOV)
426 asm("bsrl %1,%0\n\t" 458 asm("bsrl %1,%0\n\t"
427 "cmovzl %2,%0" 459 "cmovzl %2,%0"
428 : "=&r" (r) : "rm" (x), "rm" (-1)); 460 : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,11 +466,35 @@ static inline int fls(int x)
434#endif 466#endif
435 return r + 1; 467 return r + 1;
436} 468}
437#endif /* __KERNEL__ */
438
439#undef ADDR
440 469
441#ifdef __KERNEL__ 470/**
471 * fls64 - find last set bit in a 64-bit word
472 * @x: the word to search
473 *
474 * This is defined in a similar way as the libc and compiler builtin
475 * ffsll, but returns the position of the most significant set bit.
476 *
477 * fls64(value) returns 0 if value is 0 or the position of the last
478 * set bit if value is nonzero. The last (most significant) bit is
479 * at position 64.
480 */
481#ifdef CONFIG_X86_64
482static __always_inline int fls64(__u64 x)
483{
484 long bitpos = -1;
485 /*
486 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
487 * dest reg is undefined if x==0, but their CPU architect says its
488 * value is written to set it to the same as before.
489 */
490 asm("bsrq %1,%0"
491 : "+r" (bitpos)
492 : "rm" (x));
493 return bitpos + 1;
494}
495#else
496#include <asm-generic/bitops/fls64.h>
497#endif
442 498
443#include <asm-generic/bitops/find.h> 499#include <asm-generic/bitops/find.h>
444 500
@@ -450,12 +506,6 @@ static inline int fls(int x)
450 506
451#include <asm-generic/bitops/const_hweight.h> 507#include <asm-generic/bitops/const_hweight.h>
452 508
453#endif /* __KERNEL__ */
454
455#include <asm-generic/bitops/fls64.h>
456
457#ifdef __KERNEL__
458
459#include <asm-generic/bitops/le.h> 509#include <asm-generic/bitops/le.h>
460 510
461#include <asm-generic/bitops/ext2-atomic-setbit.h> 511#include <asm-generic/bitops/ext2-atomic-setbit.h>
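The ffs()/fls()/fls64() rework above leans on BSF/BSR leaving the destination register untouched when the source is zero (documented for AMD64, confirmed by Intel's architects), so preloading the destination with -1 handles the zero input without a branch or CMOV. A minimal user-space sketch of the resulting return conventions (ffs(0) == 0, fls64(0) == 0, and the most significant bit of a 64-bit word reported as position 64), written with portable loops instead of the inline assembly, purely for illustration:

#include <stdio.h>

/* Portable reference versions of the conventions used above. */
static int ref_ffs(unsigned int x)
{
        int pos = 1;

        if (!x)
                return 0;               /* ffs(0) == 0 */
        while (!(x & 1)) {
                x >>= 1;
                pos++;
        }
        return pos;                     /* 1-based index of lowest set bit */
}

static int ref_fls64(unsigned long long x)
{
        int pos = 0;

        while (x) {
                x >>= 1;
                pos++;
        }
        return pos;                     /* 0 for x == 0, 64 for the top bit */
}

int main(void)
{
        printf("%d %d %d\n", ref_ffs(0), ref_ffs(0x10), ref_fls64(1ULL << 63));
        /* prints: 0 5 64 */
        return 0;
}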
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88ec02d..2f90c51cc49d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
64 __u32 payload_offset; 64 __u32 payload_offset;
65 __u32 payload_length; 65 __u32 payload_length;
66 __u64 setup_data; 66 __u64 setup_data;
67 __u64 pref_address;
68 __u32 init_size;
67} __attribute__((packed)); 69} __attribute__((packed));
68 70
69struct sys_desc_table { 71struct sys_desc_table {
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5d3acdf5a7a6..0c9fa2745f13 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void)
14 __compiletime_error("Bad argument size for cmpxchg"); 14 __compiletime_error("Bad argument size for cmpxchg");
15extern void __xadd_wrong_size(void) 15extern void __xadd_wrong_size(void)
16 __compiletime_error("Bad argument size for xadd"); 16 __compiletime_error("Bad argument size for xadd");
17extern void __add_wrong_size(void)
18 __compiletime_error("Bad argument size for add");
17 19
18/* 20/*
19 * Constants for operation sizes. On 32-bit, the 64-bit size it set to 21 * Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void)
31#define __X86_CASE_Q -1 /* sizeof will never return -1 */ 33#define __X86_CASE_Q -1 /* sizeof will never return -1 */
32#endif 34#endif
33 35
36/*
37 * An exchange-type operation, which takes a value and a pointer, and
38 * returns the old value.
39 */
40#define __xchg_op(ptr, arg, op, lock) \
41 ({ \
42 __typeof__ (*(ptr)) __ret = (arg); \
43 switch (sizeof(*(ptr))) { \
44 case __X86_CASE_B: \
45 asm volatile (lock #op "b %b0, %1\n" \
46 : "+r" (__ret), "+m" (*(ptr)) \
47 : : "memory", "cc"); \
48 break; \
49 case __X86_CASE_W: \
50 asm volatile (lock #op "w %w0, %1\n" \
51 : "+r" (__ret), "+m" (*(ptr)) \
52 : : "memory", "cc"); \
53 break; \
54 case __X86_CASE_L: \
55 asm volatile (lock #op "l %0, %1\n" \
56 : "+r" (__ret), "+m" (*(ptr)) \
57 : : "memory", "cc"); \
58 break; \
59 case __X86_CASE_Q: \
60 asm volatile (lock #op "q %q0, %1\n" \
61 : "+r" (__ret), "+m" (*(ptr)) \
62 : : "memory", "cc"); \
63 break; \
64 default: \
65 __ ## op ## _wrong_size(); \
66 } \
67 __ret; \
68 })
69
34/* 70/*
35 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. 71 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
36 * Since this is generally used to protect other memory information, we 72 * Since this is generally used to protect other memory information, we
37 * use "asm volatile" and "memory" clobbers to prevent gcc from moving 73 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
38 * information around. 74 * information around.
39 */ 75 */
40#define __xchg(x, ptr, size) \ 76#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
41({ \
42 __typeof(*(ptr)) __x = (x); \
43 switch (size) { \
44 case __X86_CASE_B: \
45 { \
46 volatile u8 *__ptr = (volatile u8 *)(ptr); \
47 asm volatile("xchgb %0,%1" \
48 : "=q" (__x), "+m" (*__ptr) \
49 : "0" (__x) \
50 : "memory"); \
51 break; \
52 } \
53 case __X86_CASE_W: \
54 { \
55 volatile u16 *__ptr = (volatile u16 *)(ptr); \
56 asm volatile("xchgw %0,%1" \
57 : "=r" (__x), "+m" (*__ptr) \
58 : "0" (__x) \
59 : "memory"); \
60 break; \
61 } \
62 case __X86_CASE_L: \
63 { \
64 volatile u32 *__ptr = (volatile u32 *)(ptr); \
65 asm volatile("xchgl %0,%1" \
66 : "=r" (__x), "+m" (*__ptr) \
67 : "0" (__x) \
68 : "memory"); \
69 break; \
70 } \
71 case __X86_CASE_Q: \
72 { \
73 volatile u64 *__ptr = (volatile u64 *)(ptr); \
74 asm volatile("xchgq %0,%1" \
75 : "=r" (__x), "+m" (*__ptr) \
76 : "0" (__x) \
77 : "memory"); \
78 break; \
79 } \
80 default: \
81 __xchg_wrong_size(); \
82 } \
83 __x; \
84})
85
86#define xchg(ptr, v) \
87 __xchg((v), (ptr), sizeof(*ptr))
88 77
89/* 78/*
90 * Atomic compare and exchange. Compare OLD with MEM, if identical, 79 * Atomic compare and exchange. Compare OLD with MEM, if identical,
@@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void)
165 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) 154 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
166#endif 155#endif
167 156
168#define __xadd(ptr, inc, lock) \ 157/*
158 * xadd() adds "inc" to "*ptr" and atomically returns the previous
159 * value of "*ptr".
160 *
161 * xadd() is locked when multiple CPUs are online
162 * xadd_sync() is always locked
163 * xadd_local() is never locked
164 */
165#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
166#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
167#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
168#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
169
170#define __add(ptr, inc, lock) \
169 ({ \ 171 ({ \
170 __typeof__ (*(ptr)) __ret = (inc); \ 172 __typeof__ (*(ptr)) __ret = (inc); \
171 switch (sizeof(*(ptr))) { \ 173 switch (sizeof(*(ptr))) { \
172 case __X86_CASE_B: \ 174 case __X86_CASE_B: \
173 asm volatile (lock "xaddb %b0, %1\n" \ 175 asm volatile (lock "addb %b1, %0\n" \
174 : "+r" (__ret), "+m" (*(ptr)) \ 176 : "+m" (*(ptr)) : "ri" (inc) \
175 : : "memory", "cc"); \ 177 : "memory", "cc"); \
176 break; \ 178 break; \
177 case __X86_CASE_W: \ 179 case __X86_CASE_W: \
178 asm volatile (lock "xaddw %w0, %1\n" \ 180 asm volatile (lock "addw %w1, %0\n" \
179 : "+r" (__ret), "+m" (*(ptr)) \ 181 : "+m" (*(ptr)) : "ri" (inc) \
180 : : "memory", "cc"); \ 182 : "memory", "cc"); \
181 break; \ 183 break; \
182 case __X86_CASE_L: \ 184 case __X86_CASE_L: \
183 asm volatile (lock "xaddl %0, %1\n" \ 185 asm volatile (lock "addl %1, %0\n" \
184 : "+r" (__ret), "+m" (*(ptr)) \ 186 : "+m" (*(ptr)) : "ri" (inc) \
185 : : "memory", "cc"); \ 187 : "memory", "cc"); \
186 break; \ 188 break; \
187 case __X86_CASE_Q: \ 189 case __X86_CASE_Q: \
188 asm volatile (lock "xaddq %q0, %1\n" \ 190 asm volatile (lock "addq %1, %0\n" \
189 : "+r" (__ret), "+m" (*(ptr)) \ 191 : "+m" (*(ptr)) : "ri" (inc) \
190 : : "memory", "cc"); \ 192 : "memory", "cc"); \
191 break; \ 193 break; \
192 default: \ 194 default: \
193 __xadd_wrong_size(); \ 195 __add_wrong_size(); \
194 } \ 196 } \
195 __ret; \ 197 __ret; \
196 }) 198 })
197 199
198/* 200/*
199 * xadd() adds "inc" to "*ptr" and atomically returns the previous 201 * add_*() adds "inc" to "*ptr"
200 * value of "*ptr".
201 * 202 *
202 * xadd() is locked when multiple CPUs are online 203 * __add() takes a lock prefix
203 * xadd_sync() is always locked 204 * add_smp() is locked when multiple CPUs are online
204 * xadd_local() is never locked 205 * add_sync() is always locked
205 */ 206 */
206#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) 207#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
207#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") 208#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
208#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") 209
210#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
211({ \
212 bool __ret; \
213 __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \
214 __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \
215 BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
216 BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
217 VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
218 VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
219 asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
220 : "=a" (__ret), "+d" (__old2), \
221 "+m" (*(p1)), "+m" (*(p2)) \
222 : "i" (2 * sizeof(long)), "a" (__old1), \
223 "b" (__new1), "c" (__new2)); \
224 __ret; \
225})
226
227#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
228 __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
229
230#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
231 __cmpxchg_double(, p1, p2, o1, o2, n1, n2)
209 232
210#endif /* ASM_X86_CMPXCHG_H */ 233#endif /* ASM_X86_CMPXCHG_H */
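The consolidation above routes both xchg() and xadd() through the single __xchg_op() dispatcher, adds add_smp()/add_sync() for plain locked adds, and introduces a generic cmpxchg_double() built on CMPXCHG8B/CMPXCHG16B. As a rough model of the return-old-value semantics the callers rely on, here is a hedged user-space sketch using GCC's __atomic builtins (an illustration of the semantics only, not the kernel implementation):

#include <stdio.h>

int main(void)
{
        unsigned long counter = 10;

        /* xadd-style: add a value, get the previous contents back. */
        unsigned long old = __atomic_fetch_add(&counter, 5, __ATOMIC_SEQ_CST);

        /* xchg-style: store a new value, return the old one. */
        unsigned long prev = __atomic_exchange_n(&counter, 100, __ATOMIC_SEQ_CST);

        printf("old=%lu prev=%lu now=%lu\n", old, prev, counter);
        /* prints: old=10 prev=15 now=100 */
        return 0;
}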
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index fbebb07dd80b..53f4b219336b 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
166 166
167#endif 167#endif
168 168
169#define cmpxchg8b(ptr, o1, o2, n1, n2) \
170({ \
171 char __ret; \
172 __typeof__(o2) __dummy; \
173 __typeof__(*(ptr)) __old1 = (o1); \
174 __typeof__(o2) __old2 = (o2); \
175 __typeof__(*(ptr)) __new1 = (n1); \
176 __typeof__(o2) __new2 = (n2); \
177 asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
178 : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
179 : "a" (__old1), "d"(__old2), \
180 "b" (__new1), "c" (__new2) \
181 : "memory"); \
182 __ret; })
183
184
185#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
186({ \
187 char __ret; \
188 __typeof__(o2) __dummy; \
189 __typeof__(*(ptr)) __old1 = (o1); \
190 __typeof__(o2) __old2 = (o2); \
191 __typeof__(*(ptr)) __new1 = (n1); \
192 __typeof__(o2) __new2 = (n2); \
193 asm volatile("cmpxchg8b %2; setz %1" \
194 : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
195 : "a" (__old), "d"(__old2), \
196 "b" (__new1), "c" (__new2), \
197 : "memory"); \
198 __ret; })
199
200
201#define cmpxchg_double(ptr, o1, o2, n1, n2) \
202({ \
203 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
204 VM_BUG_ON((unsigned long)(ptr) % 8); \
205 cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
206})
207
208#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
209({ \
210 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
211 VM_BUG_ON((unsigned long)(ptr) % 8); \
212 cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
213})
214
215#define system_has_cmpxchg_double() cpu_has_cx8 169#define system_has_cmpxchg_double() cpu_has_cx8
216 170
217#endif /* _ASM_X86_CMPXCHG_32_H */ 171#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 285da02c38fa..614be87f1a9b 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
20 cmpxchg_local((ptr), (o), (n)); \ 20 cmpxchg_local((ptr), (o), (n)); \
21}) 21})
22 22
23#define cmpxchg16b(ptr, o1, o2, n1, n2) \
24({ \
25 char __ret; \
26 __typeof__(o2) __junk; \
27 __typeof__(*(ptr)) __old1 = (o1); \
28 __typeof__(o2) __old2 = (o2); \
29 __typeof__(*(ptr)) __new1 = (n1); \
30 __typeof__(o2) __new2 = (n2); \
31 asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
32 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
33 : "b"(__new1), "c"(__new2), \
34 "a"(__old1), "d"(__old2)); \
35 __ret; })
36
37
38#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
39({ \
40 char __ret; \
41 __typeof__(o2) __junk; \
42 __typeof__(*(ptr)) __old1 = (o1); \
43 __typeof__(o2) __old2 = (o2); \
44 __typeof__(*(ptr)) __new1 = (n1); \
45 __typeof__(o2) __new2 = (n2); \
46 asm volatile("cmpxchg16b %2;setz %1" \
47 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
48 : "b"(__new1), "c"(__new2), \
49 "a"(__old1), "d"(__old2)); \
50 __ret; })
51
52#define cmpxchg_double(ptr, o1, o2, n1, n2) \
53({ \
54 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
55 VM_BUG_ON((unsigned long)(ptr) % 16); \
56 cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
57})
58
59#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
60({ \
61 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
62 VM_BUG_ON((unsigned long)(ptr) % 16); \
63 cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
64})
65
66#define system_has_cmpxchg_double() cpu_has_cx16 23#define system_has_cmpxchg_double() cpu_has_cx16
67 24
68#endif /* _ASM_X86_CMPXCHG_64_H */ 25#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f700f36..17c5d4bdee5e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
197 197
198/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 198/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
199#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 199#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
200#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
201#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
200#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ 202#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
203#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
201#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 204#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
202 205
203#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 206#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc6..b903d5ea3941 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
101 101
102extern void hw_breakpoint_restore(void); 102extern void hw_breakpoint_restore(void);
103 103
104#ifdef CONFIG_X86_64
105DECLARE_PER_CPU(int, debug_stack_usage);
106static inline void debug_stack_usage_inc(void)
107{
108 __get_cpu_var(debug_stack_usage)++;
109}
110static inline void debug_stack_usage_dec(void)
111{
112 __get_cpu_var(debug_stack_usage)--;
113}
114int is_debug_stack(unsigned long addr);
115void debug_stack_set_zero(void);
116void debug_stack_reset(void);
117#else /* !X86_64 */
118static inline int is_debug_stack(unsigned long addr) { return 0; }
119static inline void debug_stack_set_zero(void) { }
120static inline void debug_stack_reset(void) { }
121static inline void debug_stack_usage_inc(void) { }
122static inline void debug_stack_usage_dec(void) { }
123#endif /* X86_64 */
124
125
104#endif /* __KERNEL__ */ 126#endif /* __KERNEL__ */
105 127
106#endif /* _ASM_X86_DEBUGREG_H */ 128#endif /* _ASM_X86_DEBUGREG_H */
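The new debug_stack_usage_inc()/debug_stack_usage_dec() helpers only bump a per-CPU counter so the 64-bit entry code can tell whether it interrupted a context that was already running on the debug (IST) stack. A self-contained model of that bracketing pattern, with the per-CPU machinery replaced by a plain counter for illustration:

#include <stdio.h>

/* Plain counter standing in for the per-CPU debug_stack_usage variable;
 * names mirror the kernel helpers but this is only a model. */
static int debug_stack_usage;

static void debug_stack_usage_inc(void) { debug_stack_usage++; }
static void debug_stack_usage_dec(void) { debug_stack_usage--; }

static void handle_debug_trap(void)
{
        debug_stack_usage_inc();
        /* ... exception work that may itself be interrupted ... */
        printf("debug stack users: %d\n", debug_stack_usage);
        debug_stack_usage_dec();
}

int main(void)
{
        handle_debug_trap();            /* prints: debug stack users: 1 */
        return 0;
}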
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdfc..e95822d683f4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
35 35
36extern struct desc_ptr idt_descr; 36extern struct desc_ptr idt_descr;
37extern gate_desc idt_table[]; 37extern gate_desc idt_table[];
38extern struct desc_ptr nmi_idt_descr;
39extern gate_desc nmi_idt_table[];
38 40
39struct gdt_page { 41struct gdt_page {
40 struct desc_struct gdt[GDT_ENTRIES]; 42 struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
307 desc->limit = (limit >> 16) & 0xf; 309 desc->limit = (limit >> 16) & 0xf;
308} 310}
309 311
312#ifdef CONFIG_X86_64
313static inline void set_nmi_gate(int gate, void *addr)
314{
315 gate_desc s;
316
317 pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
318 write_idt_entry(nmi_idt_table, gate, &s);
319}
320#endif
321
310static inline void _set_gate(int gate, unsigned type, void *addr, 322static inline void _set_gate(int gate, unsigned type, void *addr,
311 unsigned dpl, unsigned ist, unsigned seg) 323 unsigned dpl, unsigned ist, unsigned seg)
312{ 324{
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9a2d644c08ef..ced283ac79df 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -4,6 +4,7 @@
4#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
5 5
6#include <linux/types.h> 6#include <linux/types.h>
7#include <linux/log2.h>
7 8
8/* 9/*
9 * do_div() is NOT a C function. It wants to return 10 * do_div() is NOT a C function. It wants to return
@@ -21,15 +22,20 @@
21({ \ 22({ \
22 unsigned long __upper, __low, __high, __mod, __base; \ 23 unsigned long __upper, __low, __high, __mod, __base; \
23 __base = (base); \ 24 __base = (base); \
24 asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ 25 if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
25 __upper = __high; \ 26 __mod = n & (__base - 1); \
26 if (__high) { \ 27 n >>= ilog2(__base); \
27 __upper = __high % (__base); \ 28 } else { \
28 __high = __high / (__base); \ 29 asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
30 __upper = __high; \
31 if (__high) { \
32 __upper = __high % (__base); \
33 __high = __high / (__base); \
34 } \
35 asm("divl %2" : "=a" (__low), "=d" (__mod) \
36 : "rm" (__base), "0" (__low), "1" (__upper)); \
37 asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
29 } \ 38 } \
30 asm("divl %2":"=a" (__low), "=d" (__mod) \
31 : "rm" (__base), "0" (__low), "1" (__upper)); \
32 asm("":"=A" (n) : "a" (__low), "d" (__high)); \
33 __mod; \ 39 __mod; \
34}) 40})
35 41
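The new do_div() fast path skips both DIVL instructions when the divisor is a compile-time power of two, since n % base collapses to a mask and n / base to a shift by ilog2(base). A self-contained sketch of that identity (ilog2 replaced here by a trivial loop, values chosen only for illustration):

#include <stdio.h>

static unsigned int ilog2_pow2(unsigned long v)         /* v must be a power of two */
{
        unsigned int log = 0;

        while (v >>= 1)
                log++;
        return log;
}

int main(void)
{
        unsigned long long n = 1000000007ULL;
        unsigned long base = 4096;                       /* power of two */

        unsigned long long rem  = n & (base - 1);        /* n % base */
        unsigned long long quot = n >> ilog2_pow2(base); /* n / base */

        printf("%llu %llu\n", quot, rem);
        printf("%llu %llu\n", n / base, n % base);       /* same two numbers */
        return 0;
}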
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 7093e4a6a0bc..844f735fd63a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
3 3
4#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
5 5
6#define EFI_LOADER_SIGNATURE "EL32"
7
6extern unsigned long asmlinkage efi_call_phys(void *, ...); 8extern unsigned long asmlinkage efi_call_phys(void *, ...);
7 9
8#define efi_call_phys0(f) efi_call_phys(f) 10#define efi_call_phys0(f) efi_call_phys(f)
@@ -37,6 +39,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
37 39
38#else /* !CONFIG_X86_32 */ 40#else /* !CONFIG_X86_32 */
39 41
42#define EFI_LOADER_SIGNATURE "EL64"
43
40extern u64 efi_call0(void *fp); 44extern u64 efi_call0(void *fp);
41extern u64 efi_call1(void *fp, u64 arg1); 45extern u64 efi_call1(void *fp, u64 arg1);
42extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 46extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e4852c..4da3c0c4c974 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
117#endif 117#endif
118 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ 118 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
119 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ 119 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
120#ifdef CONFIG_X86_MRST 120#ifdef CONFIG_X86_INTEL_MID
121 FIX_LNW_VRTC, 121 FIX_LNW_VRTC,
122#endif 122#endif
123 __end_of_permanent_fixed_addresses, 123 __end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 55e4de613f0e..da0b3ca815b7 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -11,6 +11,7 @@ typedef struct {
11#ifdef CONFIG_X86_LOCAL_APIC 11#ifdef CONFIG_X86_LOCAL_APIC
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14 unsigned int icr_read_retry_count;
14#endif 15#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 16 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 17 unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea05644..6919e936345b 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
218#ifdef CONFIG_SMP 218#ifdef CONFIG_SMP
219#define safe_address (__per_cpu_offset[0]) 219#define safe_address (__per_cpu_offset[0])
220#else 220#else
221#define safe_address (kstat_cpu(0).cpustat.user) 221#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
222#endif 222#endif
223 223
224/* 224/*
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ecd2ce6..b0d5716ca1e4 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -2,17 +2,10 @@
2#define _ASM_X86_IA32_UNISTD_H 2#define _ASM_X86_IA32_UNISTD_H
3 3
4/* 4/*
5 * This file contains the system call numbers of the ia32 port, 5 * This file contains the system call numbers of the ia32 compat ABI,
6 * this is for the kernel only. 6 * this is for the kernel only.
7 * Only add syscalls here where some part of the kernel needs to know
8 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
9 */ 7 */
10 8#define __SYSCALL_ia32_NR(x) (x)
11#define __NR_ia32_restart_syscall 0 9#include <asm/unistd_32_ia32.h>
12#define __NR_ia32_exit 1
13#define __NR_ia32_read 3
14#define __NR_ia32_write 4
15#define __NR_ia32_sigreturn 119
16#define __NR_ia32_rt_sigreturn 173
17 10
18#endif /* _ASM_X86_IA32_UNISTD_H */ 11#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353e41e1..adcc0ae73d09 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
5extern void __init early_ioremap_page_table_range_init(void); 5extern void __init early_ioremap_page_table_range_init(void);
6#endif 6#endif
7 7
8extern void __init zone_sizes_init(void);
9
8extern unsigned long __init 10extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start, 11kernel_physical_mapping_init(unsigned long start,
10 unsigned long end, 12 unsigned long end,
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 88c765e16410..74df3f1eddfd 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn)
137 return (insn->vex_prefix.value != 0); 137 return (insn->vex_prefix.value != 0);
138} 138}
139 139
140/* Ensure this instruction is decoded completely */
141static inline int insn_complete(struct insn *insn)
142{
143 return insn->opcode.got && insn->modrm.got && insn->sib.got &&
144 insn->displacement.got && insn->immediate.got;
145}
146
140static inline insn_byte_t insn_vex_m_bits(struct insn *insn) 147static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
141{ 148{
142 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ 149 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
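insn_complete() above just verifies that every decoder stage (opcode, ModRM, SIB, displacement, immediate) has produced its field before a caller trusts the struct insn contents. A hedged sketch of the intended call pattern; the surrounding insn_init()/insn_get_length() calls are assumptions based on the in-tree decoder of this era, not part of this hunk:

        struct insn insn;

        insn_init(&insn, kaddr, x86_64);        /* kaddr, x86_64 supplied by the caller */
        insn_get_length(&insn);                 /* drives all decode stages */

        if (!insn_complete(&insn))
                return -EINVAL;                 /* partial decode: do not trust the fields */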
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4420993acc47..925b605eb5c6 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -3,11 +3,15 @@
3 3
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5 5
6#define IPCMSG_VRTC 0xFA /* Set vRTC device */ 6#define IPCMSG_WARM_RESET 0xF0
7 7#define IPCMSG_COLD_RESET 0xF1
8/* Command id associated with message IPCMSG_VRTC */ 8#define IPCMSG_SOFT_RESET 0xF2
9#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ 9#define IPCMSG_COLD_BOOT 0xF3
10#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ 10
11#define IPCMSG_VRTC 0xFA /* Set vRTC device */
12 /* Command id associated with message IPCMSG_VRTC */
13 #define IPC_CMD_VRTC_SETTIME 1 /* Set time */
14 #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
11 15
12/* Read single register */ 16/* Read single register */
13int intel_scu_ipc_ioread8(u16 addr, u8 *data); 17int intel_scu_ipc_ioread8(u16 addr, u8 *data);
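Alongside the new reset/boot message IDs above, the header keeps the simple SCU register accessors. A hedged usage sketch of the single-register read (kernel context assumed, the register address is purely illustrative):

        u8 val;
        int err = intel_scu_ipc_ioread8(0x10 /* illustrative register address */, &val);

        if (err)
                pr_err("SCU IPC read failed: %d\n", err);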
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef152..dffc38ee6255 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
5extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
6extern int iommu_detected; 6extern int iommu_detected;
7extern int iommu_pass_through; 7extern int iommu_pass_through;
8extern int iommu_group_mf;
8 9
9/* 10 seconds */ 10/* 10 seconds */
10#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507893e9..ab4092e3214e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); 181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); 182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); 183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
184 int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
184 void (*halt)(struct x86_emulate_ctxt *ctxt); 185 void (*halt)(struct x86_emulate_ctxt *ctxt);
185 void (*wbinvd)(struct x86_emulate_ctxt *ctxt); 186 void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
186 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); 187 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
@@ -364,6 +365,7 @@ enum x86_intercept {
364#endif 365#endif
365 366
366int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); 367int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
368bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
367#define EMULATION_FAILED -1 369#define EMULATION_FAILED -1
368#define EMULATION_OK 0 370#define EMULATION_OK 0
369#define EMULATION_RESTART 1 371#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4dab98..52d6640a5ca1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h> 18#include <linux/cpumask.h>
19#include <linux/irq_work.h>
19 20
20#include <linux/kvm.h> 21#include <linux/kvm.h>
21#include <linux/kvm_para.h> 22#include <linux/kvm_para.h>
22#include <linux/kvm_types.h> 23#include <linux/kvm_types.h>
24#include <linux/perf_event.h>
23 25
24#include <asm/pvclock-abi.h> 26#include <asm/pvclock-abi.h>
25#include <asm/desc.h> 27#include <asm/desc.h>
@@ -31,6 +33,8 @@
31#define KVM_MEMORY_SLOTS 32 33#define KVM_MEMORY_SLOTS 32
32/* memory slots that does not exposed to userspace */ 34/* memory slots that does not exposed to userspace */
33#define KVM_PRIVATE_MEM_SLOTS 4 35#define KVM_PRIVATE_MEM_SLOTS 4
36#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
37
34#define KVM_MMIO_SIZE 16 38#define KVM_MMIO_SIZE 16
35 39
36#define KVM_PIO_PAGE_OFFSET 1 40#define KVM_PIO_PAGE_OFFSET 1
@@ -228,7 +232,7 @@ struct kvm_mmu_page {
228 * One bit set per slot which has memory 232 * One bit set per slot which has memory
229 * in this shadow page. 233 * in this shadow page.
230 */ 234 */
231 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 235 DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
232 bool unsync; 236 bool unsync;
233 int root_count; /* Currently serving as active root */ 237 int root_count; /* Currently serving as active root */
234 unsigned int unsync_children; 238 unsigned int unsync_children;
@@ -239,14 +243,9 @@ struct kvm_mmu_page {
239 int clear_spte_count; 243 int clear_spte_count;
240#endif 244#endif
241 245
242 struct rcu_head rcu; 246 int write_flooding_count;
243};
244 247
245struct kvm_pv_mmu_op_buffer { 248 struct rcu_head rcu;
246 void *ptr;
247 unsigned len;
248 unsigned processed;
249 char buf[512] __aligned(sizeof(long));
250}; 249};
251 250
252struct kvm_pio_request { 251struct kvm_pio_request {
@@ -294,6 +293,37 @@ struct kvm_mmu {
294 u64 pdptrs[4]; /* pae */ 293 u64 pdptrs[4]; /* pae */
295}; 294};
296 295
296enum pmc_type {
297 KVM_PMC_GP = 0,
298 KVM_PMC_FIXED,
299};
300
301struct kvm_pmc {
302 enum pmc_type type;
303 u8 idx;
304 u64 counter;
305 u64 eventsel;
306 struct perf_event *perf_event;
307 struct kvm_vcpu *vcpu;
308};
309
310struct kvm_pmu {
311 unsigned nr_arch_gp_counters;
312 unsigned nr_arch_fixed_counters;
313 unsigned available_event_types;
314 u64 fixed_ctr_ctrl;
315 u64 global_ctrl;
316 u64 global_status;
317 u64 global_ovf_ctrl;
318 u64 counter_bitmask[2];
319 u64 global_ctrl_mask;
320 u8 version;
321 struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
322 struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
323 struct irq_work irq_work;
324 u64 reprogram_pmi;
325};
326
297struct kvm_vcpu_arch { 327struct kvm_vcpu_arch {
298 /* 328 /*
299 * rip and regs accesses must go through 329 * rip and regs accesses must go through
@@ -345,19 +375,10 @@ struct kvm_vcpu_arch {
345 */ 375 */
346 struct kvm_mmu *walk_mmu; 376 struct kvm_mmu *walk_mmu;
347 377
348 /* only needed in kvm_pv_mmu_op() path, but it's hot so
349 * put it here to avoid allocation */
350 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
351
352 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; 378 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
353 struct kvm_mmu_memory_cache mmu_page_cache; 379 struct kvm_mmu_memory_cache mmu_page_cache;
354 struct kvm_mmu_memory_cache mmu_page_header_cache; 380 struct kvm_mmu_memory_cache mmu_page_header_cache;
355 381
356 gfn_t last_pt_write_gfn;
357 int last_pt_write_count;
358 u64 *last_pte_updated;
359 gfn_t last_pte_gfn;
360
361 struct fpu guest_fpu; 382 struct fpu guest_fpu;
362 u64 xcr0; 383 u64 xcr0;
363 384
@@ -436,6 +457,8 @@ struct kvm_vcpu_arch {
436 unsigned access; 457 unsigned access;
437 gfn_t mmio_gfn; 458 gfn_t mmio_gfn;
438 459
460 struct kvm_pmu pmu;
461
439 /* used for guest single stepping over the given code position */ 462 /* used for guest single stepping over the given code position */
440 unsigned long singlestep_rip; 463 unsigned long singlestep_rip;
441 464
@@ -444,6 +467,9 @@ struct kvm_vcpu_arch {
444 467
445 cpumask_var_t wbinvd_dirty_mask; 468 cpumask_var_t wbinvd_dirty_mask;
446 469
470 unsigned long last_retry_eip;
471 unsigned long last_retry_addr;
472
447 struct { 473 struct {
448 bool halted; 474 bool halted;
449 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; 475 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
@@ -459,7 +485,6 @@ struct kvm_arch {
459 unsigned int n_requested_mmu_pages; 485 unsigned int n_requested_mmu_pages;
460 unsigned int n_max_mmu_pages; 486 unsigned int n_max_mmu_pages;
461 unsigned int indirect_shadow_pages; 487 unsigned int indirect_shadow_pages;
462 atomic_t invlpg_counter;
463 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 488 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
464 /* 489 /*
465 * Hash table of struct kvm_mmu_page. 490 * Hash table of struct kvm_mmu_page.
@@ -660,6 +685,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
660 685
661int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 686int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
662void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 687void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
688int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
689 struct kvm_memory_slot *slot);
663void kvm_mmu_zap_all(struct kvm *kvm); 690void kvm_mmu_zap_all(struct kvm *kvm);
664unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 691unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
665void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 692void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
@@ -668,8 +695,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
668 695
669int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 696int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
670 const void *val, int bytes); 697 const void *val, int bytes);
671int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
672 gpa_t addr, unsigned long *ret);
673u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); 698u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
674 699
675extern bool tdp_enabled; 700extern bool tdp_enabled;
@@ -692,6 +717,7 @@ enum emulation_result {
692#define EMULTYPE_NO_DECODE (1 << 0) 717#define EMULTYPE_NO_DECODE (1 << 0)
693#define EMULTYPE_TRAP_UD (1 << 1) 718#define EMULTYPE_TRAP_UD (1 << 1)
694#define EMULTYPE_SKIP (1 << 2) 719#define EMULTYPE_SKIP (1 << 2)
720#define EMULTYPE_RETRY (1 << 3)
695int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, 721int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
696 int emulation_type, void *insn, int insn_len); 722 int emulation_type, void *insn, int insn_len);
697 723
@@ -734,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
734 760
735unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 761unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
736void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 762void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
763bool kvm_rdpmc(struct kvm_vcpu *vcpu);
737 764
738void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 765void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
739void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 766void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -754,13 +781,14 @@ int fx_init(struct kvm_vcpu *vcpu);
754 781
755void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 782void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
756void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 783void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
757 const u8 *new, int bytes, 784 const u8 *new, int bytes);
758 bool guest_initiated); 785int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
759int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 786int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
760void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 787void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
761int kvm_mmu_load(struct kvm_vcpu *vcpu); 788int kvm_mmu_load(struct kvm_vcpu *vcpu);
762void kvm_mmu_unload(struct kvm_vcpu *vcpu); 789void kvm_mmu_unload(struct kvm_vcpu *vcpu);
763void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 790void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
791gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
764gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 792gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
765 struct x86_exception *exception); 793 struct x86_exception *exception);
766gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 794gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -782,6 +810,11 @@ void kvm_disable_tdp(void);
782int complete_pio(struct kvm_vcpu *vcpu); 810int complete_pio(struct kvm_vcpu *vcpu);
783bool kvm_check_iopl(struct kvm_vcpu *vcpu); 811bool kvm_check_iopl(struct kvm_vcpu *vcpu);
784 812
813static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
814{
815 return gpa;
816}
817
785static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 818static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
786{ 819{
787 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 820 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -894,4 +927,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
894 927
895void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); 928void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
896 929
930int kvm_is_in_guest(void);
931
932void kvm_pmu_init(struct kvm_vcpu *vcpu);
933void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
934void kvm_pmu_reset(struct kvm_vcpu *vcpu);
935void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
936bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
937int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
938int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
939int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
940void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
941void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
942
897#endif /* _ASM_X86_KVM_HOST_H */ 943#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae9..88d0c3c74c13 100644
--- a/arch/x86/include/asm/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
@@ -15,7 +15,7 @@
15 15
16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ 16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
17#define CALIBRATE_LATCH \ 17#define CALIBRATE_LATCH \
18 ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) 18 ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
19 19
20static inline void mach_prepare_counter(void) 20static inline void mach_prepare_counter(void)
21{ 21{
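Switching CALIBRATE_LATCH above from CLOCK_TICK_RATE to PIT_TICK_RATE pins the calibration window to the 8254 input clock. Assuming the usual PIT_TICK_RATE of 1193182 Hz, the macro evaluates to 35795 PIT ticks for the 30 ms window; a tiny sketch that just reproduces the rounding arithmetic:

#include <stdio.h>

int main(void)
{
        const unsigned long pit_tick_rate = 1193182;    /* assumed 8254 input clock, Hz */
        const unsigned long msec = 30;                  /* CALIBRATE_TIME_MSEC */

        printf("%lu\n", (pit_tick_rate * msec + 1000 / 2) / 1000);
        /* prints: 35795 */
        return 0;
}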
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 01fdf5674e24..0e8e85bb7c51 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void)
81#else 81#else
82#define lock_cmos_prefix(reg) do {} while (0) 82#define lock_cmos_prefix(reg) do {} while (0)
83#define lock_cmos_suffix(reg) do {} while (0) 83#define lock_cmos_suffix(reg) do {} while (0)
84#define lock_cmos(reg) 84#define lock_cmos(reg) do { } while (0)
85#define unlock_cmos() 85#define unlock_cmos() do { } while (0)
86#define do_i_have_lock_cmos() 0 86#define do_i_have_lock_cmos() 0
87#define current_lock_cmos_reg() 0 87#define current_lock_cmos_reg() 0
88#endif 88#endif
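Turning the empty lock_cmos()/unlock_cmos() stubs into do { } while (0) gives the disabled-config stubs statement semantics: the call still requires a trailing semicolon and no longer leaves a bare ";" as an if-body, which -Wempty-body would flag. A minimal illustration of the difference (macro names here are illustrative, not the kernel's):

#define stub_empty()                            /* expands to nothing */
#define stub_safe()     do { } while (0)

void example(int cond)
{
        if (cond)
                stub_empty();   /* leaves a bare ";" as the if-body (-Wempty-body warns) */

        if (cond)
                stub_safe();    /* always a single well-formed statement */
}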
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 0e8ae57d3656..6aefb14cbbc5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -50,10 +50,11 @@
50#define MCJ_CTX_MASK 3 50#define MCJ_CTX_MASK 3
51#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) 51#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
52#define MCJ_CTX_RANDOM 0 /* inject context: random */ 52#define MCJ_CTX_RANDOM 0 /* inject context: random */
53#define MCJ_CTX_PROCESS 1 /* inject context: process */ 53#define MCJ_CTX_PROCESS 0x1 /* inject context: process */
54#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ 54#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */
55#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ 55#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */
56#define MCJ_EXCEPTION 8 /* raise as exception */ 56#define MCJ_EXCEPTION 0x8 /* raise as exception */
57#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */
57 58
58/* Fields are zero when not available */ 59/* Fields are zero when not available */
59struct mce { 60struct mce {
@@ -120,7 +121,8 @@ struct mce_log {
120 121
121#ifdef __KERNEL__ 122#ifdef __KERNEL__
122 123
123extern struct atomic_notifier_head x86_mce_decoder_chain; 124extern void mce_register_decode_chain(struct notifier_block *nb);
125extern void mce_unregister_decode_chain(struct notifier_block *nb);
124 126
125#include <linux/percpu.h> 127#include <linux/percpu.h>
126#include <linux/init.h> 128#include <linux/init.h>
@@ -149,7 +151,7 @@ static inline void enable_p5_mce(void) {}
149 151
150void mce_setup(struct mce *m); 152void mce_setup(struct mce *m);
151void mce_log(struct mce *m); 153void mce_log(struct mce *m);
152DECLARE_PER_CPU(struct sys_device, mce_sysdev); 154extern struct device *mce_device[CONFIG_NR_CPUS];
153 155
154/* 156/*
155 * Maximum banks number. 157 * Maximum banks number.
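The register/unregister helpers above hide the x86_mce_decoder_chain notifier head from modules that want to see decoded machine checks. A hedged sketch of a subscriber (kernel context assumed, the callback body is illustrative only):

static int my_mce_decode(struct notifier_block *nb, unsigned long val,
                         void *data)
{
        struct mce *m = data;

        pr_info("MCE on CPU %u, status 0x%llx\n",
                m->extcpu, (unsigned long long)m->status);
        return NOTIFY_OK;
}

static struct notifier_block my_mce_nb = {
        .notifier_call = my_mce_decode,
};

/* module init:  mce_register_decode_chain(&my_mce_nb);   */
/* module exit:  mce_unregister_decode_chain(&my_mce_nb); */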
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 24215072d0e1..4ebe157bf73d 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,7 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
48 48
49#ifdef CONFIG_MICROCODE_AMD 49#ifdef CONFIG_MICROCODE_AMD
50extern struct microcode_ops * __init init_amd_microcode(void); 50extern struct microcode_ops * __init init_amd_microcode(void);
51extern void __exit exit_amd_microcode(void);
51 52
52static inline void get_ucode_data(void *to, const u8 *from, size_t n) 53static inline void get_ucode_data(void *to, const u8 *from, size_t n)
53{ 54{
@@ -59,6 +60,7 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
59{ 60{
60 return NULL; 61 return NULL;
61} 62}
63static inline void __exit exit_amd_microcode(void) {}
62#endif 64#endif
63 65
64#endif /* _ASM_X86_MICROCODE_H */ 66#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index e6283129c821..0a0a95460434 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -31,11 +31,20 @@ enum mrst_cpu_type {
31}; 31};
32 32
33extern enum mrst_cpu_type __mrst_cpu_chip; 33extern enum mrst_cpu_type __mrst_cpu_chip;
34
35#ifdef CONFIG_X86_INTEL_MID
36
34static inline enum mrst_cpu_type mrst_identify_cpu(void) 37static inline enum mrst_cpu_type mrst_identify_cpu(void)
35{ 38{
36 return __mrst_cpu_chip; 39 return __mrst_cpu_chip;
37} 40}
38 41
42#else /* !CONFIG_X86_INTEL_MID */
43
44#define mrst_identify_cpu() (0)
45
46#endif /* !CONFIG_X86_INTEL_MID */
47
39enum mrst_timer_options { 48enum mrst_timer_options {
40 MRST_TIMER_DEFAULT, 49 MRST_TIMER_DEFAULT,
41 MRST_TIMER_APBT_ONLY, 50 MRST_TIMER_APBT_ONLY,
@@ -58,7 +67,7 @@ extern struct console early_mrst_console;
58extern void mrst_early_console_init(void); 67extern void mrst_early_console_init(void);
59 68
60extern struct console early_hsu_console; 69extern struct console early_hsu_console;
61extern void hsu_early_console_init(void); 70extern void hsu_early_console_init(const char *);
62 71
63extern void intel_scu_devices_create(void); 72extern void intel_scu_devices_create(void);
64extern void intel_scu_devices_destroy(void); 73extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274cd..95203d40ffdd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -169,7 +169,14 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
169 return native_write_msr_safe(msr, low, high); 169 return native_write_msr_safe(msr, low, high);
170} 170}
171 171
172/* rdmsr with exception handling */ 172/*
173 * rdmsr with exception handling.
174 *
175 * Please note that the exception handling works only after we've
176 * switched to the "smart" #GP handler in trap_init() which knows about
177 * exception tables - using this macro earlier than that causes machine
178 * hangs on boxes which do not implement the @msr in the first argument.
179 */
173#define rdmsr_safe(msr, p1, p2) \ 180#define rdmsr_safe(msr, p1, p2) \
174({ \ 181({ \
175 int __err; \ 182 int __err; \
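The expanded comment stresses that rdmsr_safe() only becomes safe once trap_init() has installed the exception-table-aware #GP handler; before that point, probing a missing MSR hangs the machine instead of returning an error. A hedged usage sketch (kernel context assumed, the MSR chosen only as an example):

        u32 lo, hi;

        /* Only valid after trap_init(); a missing MSR then returns an error
         * instead of an unrecoverable #GP. */
        if (rdmsr_safe(MSR_IA32_PLATFORM_ID, &lo, &hi))
                pr_warn("MSR_IA32_PLATFORM_ID not implemented\n");
        else
                pr_info("platform id: %08x%08x\n", hi, lo);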
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644
index 000000000000..660f843df928
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -0,0 +1,167 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific Header file
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
15#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
16
17#include <linux/numa.h>
18#include <linux/percpu.h>
19#include <linux/io.h>
20#include <linux/swab.h>
21#include <asm/types.h>
22#include <asm/processor.h>
23
24#define CSR_NODE_SHIFT 16
25#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
26#define CSR_NODE_MASK 0x0fff /* 4K nodes */
27
28/* 32K CSR space, b15 indicates geo/non-geo */
29#define CSR_OFFSET_MASK 0x7fffUL
30
31/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */
32#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL
33#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL
34#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1)
35
36/*
37 * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
38 * when using the direct mapping on x86_64, both start and size need to be
39 * aligned with PMD_SIZE which is 2M
40 */
41#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
42#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
43#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
44
45static inline void *gcsr_address(int node, unsigned long offset)
46{
47 return __va(NUMACHIP_GCSR_BASE | (1UL << 15) |
48 CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK));
49}
50
51static inline void *lcsr_address(unsigned long offset)
52{
53 return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
54 CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
55}
56
57static inline unsigned int read_gcsr(int node, unsigned long offset)
58{
59 return swab32(readl(gcsr_address(node, offset)));
60}
61
62static inline void write_gcsr(int node, unsigned long offset, unsigned int val)
63{
64 writel(swab32(val), gcsr_address(node, offset));
65}
66
67static inline unsigned int read_lcsr(unsigned long offset)
68{
69 return swab32(readl(lcsr_address(offset)));
70}
71
72static inline void write_lcsr(unsigned long offset, unsigned int val)
73{
74 writel(swab32(val), lcsr_address(offset));
75}
76
77/* ========================================================================= */
78/* CSR_G0_STATE_CLEAR */
79/* ========================================================================= */
80
81#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12))
82union numachip_csr_g0_state_clear {
83 unsigned int v;
84 struct numachip_csr_g0_state_clear_s {
85 unsigned int _state:2;
86 unsigned int _rsvd_2_6:5;
87 unsigned int _lost:1;
88 unsigned int _rsvd_8_31:24;
89 } s;
90};
91
92/* ========================================================================= */
93/* CSR_G0_NODE_IDS */
94/* ========================================================================= */
95
96#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
97union numachip_csr_g0_node_ids {
98 unsigned int v;
99 struct numachip_csr_g0_node_ids_s {
100 unsigned int _initialid:16;
101 unsigned int _nodeid:12;
102 unsigned int _rsvd_28_31:4;
103 } s;
104};
105
106/* ========================================================================= */
107/* CSR_G3_EXT_IRQ_GEN */
108/* ========================================================================= */
109
110#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
111union numachip_csr_g3_ext_irq_gen {
112 unsigned int v;
113 struct numachip_csr_g3_ext_irq_gen_s {
114 unsigned int _vector:8;
115 unsigned int _msgtype:3;
116 unsigned int _index:5;
117 unsigned int _destination_apic_id:16;
118 } s;
119};
120
121/* ========================================================================= */
122/* CSR_G3_EXT_IRQ_STATUS */
123/* ========================================================================= */
124
125#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12))
126union numachip_csr_g3_ext_irq_status {
127 unsigned int v;
128 struct numachip_csr_g3_ext_irq_status_s {
129 unsigned int _result:32;
130 } s;
131};
132
133/* ========================================================================= */
134/* CSR_G3_EXT_IRQ_DEST */
135/* ========================================================================= */
136
137#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12))
138union numachip_csr_g3_ext_irq_dest {
139 unsigned int v;
140 struct numachip_csr_g3_ext_irq_dest_s {
141 unsigned int _irq:8;
142 unsigned int _rsvd_8_31:24;
143 } s;
144};
145
146/* ========================================================================= */
147/* CSR_G3_NC_ATT_MAP_SELECT */
148/* ========================================================================= */
149
150#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12))
151union numachip_csr_g3_nc_att_map_select {
152 unsigned int v;
153 struct numachip_csr_g3_nc_att_map_select_s {
154 unsigned int _upper_address_bits:4;
155 unsigned int _select_ram:4;
156 unsigned int _rsvd_8_31:24;
157 } s;
158};
159
160/* ========================================================================= */
161/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */
162/* ========================================================================= */
163
164#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12))
165
166#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
167
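The accessors in this new header map the NumaConnect global and local CSR windows through the direct mapping and byte-swap every access, since the CSRs are big-endian. A hedged sketch that reads the local node id through the G0_NODE_IDS register using the union defined above (calling context assumed):

        union numachip_csr_g0_node_ids node_ids;

        node_ids.v = read_lcsr(CSR_G0_NODE_IDS);
        pr_info("NumaChip node id: %u\n", node_ids.s._nodeid);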
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943b906c..df75d07571ce 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
112{ 112{
113 x86_msi.teardown_msi_irq(irq); 113 x86_msi.teardown_msi_irq(irq);
114} 114}
115static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
116{
117 x86_msi.restore_msi_irqs(dev, irq);
118}
115#define arch_setup_msi_irqs x86_setup_msi_irqs 119#define arch_setup_msi_irqs x86_setup_msi_irqs
116#define arch_teardown_msi_irqs x86_teardown_msi_irqs 120#define arch_teardown_msi_irqs x86_teardown_msi_irqs
117#define arch_teardown_msi_irq x86_teardown_msi_irq 121#define arch_teardown_msi_irq x86_teardown_msi_irq
122#define arch_restore_msi_irqs x86_restore_msi_irqs
118/* implemented in arch/x86/kernel/apic/io_apic. */ 123/* implemented in arch/x86/kernel/apic/io_apic. */
119int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 124int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
120void native_teardown_msi_irq(unsigned int irq); 125void native_teardown_msi_irq(unsigned int irq);
126void native_restore_msi_irqs(struct pci_dev *dev, int irq);
121/* default to the implementation in drivers/lib/msi.c */ 127/* default to the implementation in drivers/lib/msi.c */
122#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS 128#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
129#define HAVE_DEFAULT_MSI_RESTORE_IRQS
123void default_teardown_msi_irqs(struct pci_dev *dev); 130void default_teardown_msi_irqs(struct pci_dev *dev);
131void default_restore_msi_irqs(struct pci_dev *dev, int irq);
124#else 132#else
125#define native_setup_msi_irqs NULL 133#define native_setup_msi_irqs NULL
126#define native_teardown_msi_irq NULL 134#define native_teardown_msi_irq NULL
127#define default_teardown_msi_irqs NULL 135#define default_teardown_msi_irqs NULL
136#define default_restore_msi_irqs NULL
128#endif 137#endif
129 138
130#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 139#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e38197806853..b3a531746026 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -44,8 +44,6 @@ enum pci_bf_sort_state {
44 44
45/* pci-i386.c */ 45/* pci-i386.c */
46 46
47extern unsigned int pcibios_max_latency;
48
49void pcibios_resource_survey(void); 47void pcibios_resource_survey(void);
50void pcibios_set_cache_line_size(void); 48void pcibios_set_cache_line_size(void);
51 49
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d0ebba..7a11910a63c4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,22 +414,6 @@ do { \
414#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 414#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 415#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
416 416
417#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
418#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
419#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
420#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
421#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
422#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
423#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
424#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
425#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
426#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
427#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
428#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
429#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
430#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
431#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
432
433#ifndef CONFIG_M386 417#ifndef CONFIG_M386
434#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 418#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
435#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) 419#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
@@ -445,29 +429,22 @@ do { \
445#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 429#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
446#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 430#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
447 431
448#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
449#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
450#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
451#endif /* !CONFIG_M386 */ 432#endif /* !CONFIG_M386 */
452 433
453#ifdef CONFIG_X86_CMPXCHG64 434#ifdef CONFIG_X86_CMPXCHG64
454#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ 435#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \
455({ \ 436({ \
456 char __ret; \ 437 bool __ret; \
457 typeof(o1) __o1 = o1; \ 438 typeof(pcp1) __o1 = (o1), __n1 = (n1); \
458 typeof(o1) __n1 = n1; \ 439 typeof(pcp2) __o2 = (o2), __n2 = (n2); \
459 typeof(o2) __o2 = o2; \
460 typeof(o2) __n2 = n2; \
461 typeof(o2) __dummy = n2; \
462 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ 440 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
463 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ 441 : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
464 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ 442 : "b" (__n1), "c" (__n2), "a" (__o1)); \
465 __ret; \ 443 __ret; \
466}) 444})
467 445
468#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) 446#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
469#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) 447#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
470#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
471#endif /* CONFIG_X86_CMPXCHG64 */ 448#endif /* CONFIG_X86_CMPXCHG64 */
472 449
473/* 450/*
@@ -495,44 +472,28 @@ do { \
495#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 472#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
496#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 473#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
497 474
498#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
499#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
500#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
501#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
502#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
503#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
504
505/* 475/*
506 * Pretty complex macro to generate cmpxchg16 instruction. The instruction 476 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
507 * is not supported on early AMD64 processors so we must be able to emulate 477 * is not supported on early AMD64 processors so we must be able to emulate
508 * it in software. The address used in the cmpxchg16 instruction must be 478 * it in software. The address used in the cmpxchg16 instruction must be
509 * aligned to a 16 byte boundary. 479 * aligned to a 16 byte boundary.
510 */ 480 */
511#ifdef CONFIG_SMP 481#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \
512#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
513#else
514#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
515#endif
516#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
517({ \ 482({ \
518 char __ret; \ 483 bool __ret; \
519 typeof(o1) __o1 = o1; \ 484 typeof(pcp1) __o1 = (o1), __n1 = (n1); \
520 typeof(o1) __n1 = n1; \ 485 typeof(pcp2) __o2 = (o2), __n2 = (n2); \
521 typeof(o2) __o2 = o2; \ 486 alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
522 typeof(o2) __n2 = n2; \ 487 "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
523 typeof(o2) __dummy; \
524 alternative_io(CMPXCHG16B_EMU_CALL, \
525 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
526 X86_FEATURE_CX16, \ 488 X86_FEATURE_CX16, \
527 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ 489 ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \
528 "S" (&pcp1), "b"(__n1), "c"(__n2), \ 490 "+m" (pcp2), "+d" (__o2)), \
529 "a"(__o1), "d"(__o2) : "memory"); \ 491 "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \
530 __ret; \ 492 __ret; \
531}) 493})
532 494
533#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) 495#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
534#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) 496#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
535#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
536 497
537#endif 498#endif
538 499
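The reworked percpu_cmpxchg8b_double()/percpu_cmpxchg16b_double() above now take both per-cpu variables and return a bool, which lets the generic this_cpu_cmpxchg_double() wrapper forward its arguments unchanged. A minimal sketch of a caller, assuming kernel context; the struct and function names below are hypothetical and only illustrate the required layout (two adjacent words, aligned to their combined size):

	/* Hypothetical per-cpu pair that is updated atomically as one unit. */
	struct pcp_pair {
		unsigned long head;                 /* first word                 */
		unsigned long gen;                  /* must immediately follow it */
	} __aligned(2 * sizeof(unsigned long));     /* 16-byte aligned on 64-bit  */

	static DEFINE_PER_CPU(struct pcp_pair, pcp_pair);

	static bool pcp_pair_update(unsigned long old_head, unsigned long old_gen,
				    unsigned long new_head, unsigned long new_gen)
	{
		/* True when both words still held the expected values and were
		 * replaced in a single cmpxchg8b/cmpxchg16b (or emulated) step. */
		return this_cpu_cmpxchg_double(pcp_pair.head, pcp_pair.gen,
					       old_head, old_gen,
					       new_head, new_gen);
	}

On pre-CX16 AMD64 parts the alternative_io() path above falls back to this_cpu_cmpxchg16b_emu, so callers do not need to check X86_FEATURE_CX16 themselves.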
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f61c62f7d5d8..096c975e099f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -57,6 +57,7 @@
57 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 57 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
58 58
59#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 59#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
60#define ARCH_PERFMON_EVENTS_COUNT 7
60 61
61/* 62/*
62 * Intel "Architectural Performance Monitoring" CPUID 63 * Intel "Architectural Performance Monitoring" CPUID
@@ -72,6 +73,19 @@ union cpuid10_eax {
72 unsigned int full; 73 unsigned int full;
73}; 74};
74 75
76union cpuid10_ebx {
77 struct {
78 unsigned int no_unhalted_core_cycles:1;
79 unsigned int no_instructions_retired:1;
80 unsigned int no_unhalted_reference_cycles:1;
81 unsigned int no_llc_reference:1;
82 unsigned int no_llc_misses:1;
83 unsigned int no_branch_instruction_retired:1;
84 unsigned int no_branch_misses_retired:1;
85 } split;
86 unsigned int full;
87};
88
75union cpuid10_edx { 89union cpuid10_edx {
76 struct { 90 struct {
77 unsigned int num_counters_fixed:5; 91 unsigned int num_counters_fixed:5;
@@ -81,6 +95,15 @@ union cpuid10_edx {
81 unsigned int full; 95 unsigned int full;
82}; 96};
83 97
98struct x86_pmu_capability {
99 int version;
100 int num_counters_gp;
101 int num_counters_fixed;
102 int bit_width_gp;
103 int bit_width_fixed;
104 unsigned int events_mask;
105 int events_mask_len;
106};
84 107
85/* 108/*
86 * Fixed-purpose performance events: 109 * Fixed-purpose performance events:
@@ -89,23 +112,24 @@ union cpuid10_edx {
89/* 112/*
90 * All 3 fixed-mode PMCs are configured via this single MSR: 113 * All 3 fixed-mode PMCs are configured via this single MSR:
91 */ 114 */
92#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d 115#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
93 116
94/* 117/*
95 * The counts are available in three separate MSRs: 118 * The counts are available in three separate MSRs:
96 */ 119 */
97 120
98/* Instr_Retired.Any: */ 121/* Instr_Retired.Any: */
99#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 122#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
100#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) 123#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
101 124
102/* CPU_CLK_Unhalted.Core: */ 125/* CPU_CLK_Unhalted.Core: */
103#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a 126#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
104#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) 127#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
105 128
106/* CPU_CLK_Unhalted.Ref: */ 129/* CPU_CLK_Unhalted.Ref: */
107#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 130#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
108#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) 131#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2)
132#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES)
109 133
110/* 134/*
111 * We model BTS tracing as another fixed-mode PMC. 135 * We model BTS tracing as another fixed-mode PMC.
@@ -202,6 +226,7 @@ struct perf_guest_switch_msr {
202}; 226};
203 227
204extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 228extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
229extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
205#else 230#else
206static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 231static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
207{ 232{
@@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
209 return NULL; 234 return NULL;
210} 235}
211 236
237static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
238{
239 memset(cap, 0, sizeof(*cap));
240}
241
212static inline void perf_events_lapic_init(void) { } 242static inline void perf_events_lapic_init(void) { }
213#endif 243#endif
214 244
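The new x86_pmu_capability structure and perf_get_x86_pmu_capability() give built-in code (KVM's PMU virtualization being the intended consumer) a snapshot of the architectural PMU without re-reading CPUID itself. A hedged sketch of a consumer; the function name and the pr_info() reporting are illustrative only:

	#include <asm/perf_event.h>

	static void report_pmu_caps(void)
	{
		struct x86_pmu_capability cap;

		perf_get_x86_pmu_capability(&cap);
		if (!cap.version)	/* zeroed by the stub when perf events are off */
			return;

		pr_info("PMU v%d: %d GP counters (%d bit), %d fixed (%d bit)\n",
			cap.version, cap.num_counters_gp, cap.bit_width_gp,
			cap.num_counters_fixed, cap.bit_width_fixed);
	}

The stub branch shown above memsets the structure to zero, so checking cap.version is enough to detect the disabled case.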
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c86fab1..49afb3f41eb6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
703 pte_update(mm, addr, ptep); 703 pte_update(mm, addr, ptep);
704} 704}
705 705
706#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
707 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) 708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709 709
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 2dddb317bb39..f8ab3eaad128 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -6,6 +6,7 @@
6 * EFLAGS bits 6 * EFLAGS bits
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 10#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ 11#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 12#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435ffb53..aa9088c26931 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -99,7 +99,6 @@ struct cpuinfo_x86 {
99 u16 apicid; 99 u16 apicid;
100 u16 initial_apicid; 100 u16 initial_apicid;
101 u16 x86_clflush_size; 101 u16 x86_clflush_size;
102#ifdef CONFIG_SMP
103 /* number of cores as seen by the OS: */ 102 /* number of cores as seen by the OS: */
104 u16 booted_cores; 103 u16 booted_cores;
105 /* Physical processor id: */ 104 /* Physical processor id: */
@@ -110,7 +109,6 @@ struct cpuinfo_x86 {
110 u8 compute_unit_id; 109 u8 compute_unit_id;
111 /* Index into per_cpu list: */ 110 /* Index into per_cpu list: */
112 u16 cpu_index; 111 u16 cpu_index;
113#endif
114 u32 microcode; 112 u32 microcode;
115} __attribute__((__aligned__(SMP_CACHE_BYTES))); 113} __attribute__((__aligned__(SMP_CACHE_BYTES)));
116 114
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..d3ef63fe0c81
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,63 @@
1#ifndef ASM_X86_SERPENT_H
2#define ASM_X86_SERPENT_H
3
4#include <linux/crypto.h>
5#include <crypto/serpent.h>
6
7#ifdef CONFIG_X86_32
8
9#define SERPENT_PARALLEL_BLOCKS 4
10
11asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src, bool xor);
13asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
14 const u8 *src);
15
16static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
17 const u8 *src)
18{
19 __serpent_enc_blk_4way(ctx, dst, src, false);
20}
21
22static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
23 const u8 *src)
24{
25 __serpent_enc_blk_4way(ctx, dst, src, true);
26}
27
28static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
29 const u8 *src)
30{
31 serpent_dec_blk_4way(ctx, dst, src);
32}
33
34#else
35
36#define SERPENT_PARALLEL_BLOCKS 8
37
38asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
39 const u8 *src, bool xor);
40asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
41 const u8 *src);
42
43static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
44 const u8 *src)
45{
46 __serpent_enc_blk_8way(ctx, dst, src, false);
47}
48
49static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
50 const u8 *src)
51{
52 __serpent_enc_blk_8way(ctx, dst, src, true);
53}
54
55static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
56 const u8 *src)
57{
58 serpent_dec_blk_8way(ctx, dst, src);
59}
60
61#endif
62
63#endif
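Both the 4-way (i586) and 8-way (x86-64) assembler implementations are hidden behind the same serpent_enc_blk_xway()/serpent_dec_blk_xway() interface, so glue code only has to handle buffers in multiples of SERPENT_PARALLEL_BLOCKS. A hedged sketch of such a driver loop; SERPENT_BLOCK_SIZE and struct serpent_ctx come from <crypto/serpent.h>, while the wrapper function itself is hypothetical:

	static int serpent_encrypt_buffer(struct serpent_ctx *ctx, u8 *dst,
					  const u8 *src, unsigned int nbytes)
	{
		const unsigned int chunk =
			SERPENT_PARALLEL_BLOCKS * SERPENT_BLOCK_SIZE;

		if (nbytes % chunk)
			return -EINVAL;	/* leftover blocks belong on the scalar path */

		while (nbytes) {
			/* Encrypts 4 or 8 blocks per call, depending on arch. */
			serpent_enc_blk_xway(ctx, dst, src);
			src += chunk;
			dst += chunk;
			nbytes -= chunk;
		}
		return 0;
	}

The real glue code in arch/x86/crypto/serpent_sse2_glue.c does this kind of chunking inside the blkcipher walk and hands any tail blocks to the generic single-block routine.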
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551ec760..d0f19f9fb846 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
47extern void i386_reserve_resources(void); 47extern void i386_reserve_resources(void);
48extern void setup_default_timer_irq(void); 48extern void setup_default_timer_irq(void);
49 49
50#ifdef CONFIG_X86_MRST 50#ifdef CONFIG_X86_INTEL_MID
51extern void x86_mrst_early_setup(void); 51extern void x86_mrst_early_setup(void);
52#else 52#else
53static inline void x86_mrst_early_setup(void) { } 53static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
225 225
226#endif /* CONFIG_X86_LOCAL_APIC */ 226#endif /* CONFIG_X86_LOCAL_APIC */
227 227
228#ifdef CONFIG_DEBUG_NMI_SELFTEST
229extern void nmi_selftest(void);
230#else
231#define nmi_selftest() do { } while (0)
232#endif
233
228#endif /* __ASSEMBLY__ */ 234#endif /* __ASSEMBLY__ */
229#endif /* _ASM_X86_SMP_H */ 235#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 972c260919a3..a82c2bf504b6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
79 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; 79 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
80} 80}
81 81
82#if (NR_CPUS < 256)
83static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) 82static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
84{ 83{
85 asm volatile(UNLOCK_LOCK_PREFIX "incb %0" 84 __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
86 : "+m" (lock->head_tail)
87 :
88 : "memory", "cc");
89} 85}
90#else
91static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
92{
93 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
94 : "+m" (lock->head_tail)
95 :
96 : "memory", "cc");
97}
98#endif
99 86
100static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) 87static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
101{ 88{
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c4a348f7bd43..d962e5652a73 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <asm/asm-offsets.h> /* For NR_syscalls */
18 19
19extern const unsigned long sys_call_table[]; 20extern const unsigned long sys_call_table[];
20 21
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c2ff2a1d845e..2d2f01ce6dcb 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -401,6 +401,7 @@ extern unsigned long arch_align_stack(unsigned long sp);
401extern void free_init_pages(char *what, unsigned long begin, unsigned long end); 401extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
402 402
403void default_idle(void); 403void default_idle(void);
404bool set_pm_idle_to_default(void);
404 405
405void stop_this_cpu(void *dummy); 406void stop_this_cpu(void *dummy);
406 407
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c127b52..bc817cd8b443 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,7 +40,8 @@ struct thread_info {
40 */ 40 */
41 __u8 supervisor_stack[0]; 41 __u8 supervisor_stack[0];
42#endif 42#endif
43 int uaccess_err; 43 unsigned int sig_on_uaccess_error:1;
44 unsigned int uaccess_err:1; /* uaccess failed */
44}; 45};
45 46
46#define INIT_THREAD_INFO(tsk) \ 47#define INIT_THREAD_INFO(tsk) \
@@ -90,7 +91,6 @@ struct thread_info {
90#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ 91#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
91#define TIF_DEBUG 21 /* uses debug registers */ 92#define TIF_DEBUG 21 /* uses debug registers */
92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 93#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
93#define TIF_FREEZE 23 /* is freezing for suspend */
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ 95#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
96#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 96#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -112,7 +112,6 @@ struct thread_info {
112#define _TIF_FORK (1 << TIF_FORK) 112#define _TIF_FORK (1 << TIF_FORK)
113#define _TIF_DEBUG (1 << TIF_DEBUG) 113#define _TIF_DEBUG (1 << TIF_DEBUG)
114#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 114#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
115#define _TIF_FREEZE (1 << TIF_FREEZE)
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 115#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) 116#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
118#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 117#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
@@ -231,6 +230,12 @@ static inline struct thread_info *current_thread_info(void)
231 movq PER_CPU_VAR(kernel_stack),reg ; \ 230 movq PER_CPU_VAR(kernel_stack),reg ; \
232 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg 231 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
233 232
233/*
234 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
235 * a certain register (to be used in assembler memory operands).
236 */
237#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
238
234#endif 239#endif
235 240
236#endif /* !X86_32 */ 241#endif /* !X86_32 */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b9176b76c..431793e5d484 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
32 * (mathieu.desnoyers@polymtl.ca) 32 * (mathieu.desnoyers@polymtl.ca)
33 * 33 *
34 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 34 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
35 *
36 * In:
37 *
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * Although we may still have enough bits to store the value of ns,
41 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
42 * leading to an incorrect result.
43 *
44 * To avoid this, we can decompose 'cycles' into quotient and remainder
45 * of division by SC. Then,
46 *
47 * ns = (quot * SC + rem) * cyc2ns_scale / SC
48 * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
49 *
50 * - sqazi@google.com
35 */ 51 */
36 52
37DECLARE_PER_CPU(unsigned long, cyc2ns); 53DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
41 57
42static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 58static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
43{ 59{
60 unsigned long long quot;
61 unsigned long long rem;
44 int cpu = smp_processor_id(); 62 int cpu = smp_processor_id();
45 unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 63 unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
46 ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; 64 quot = (cyc >> CYC2NS_SCALE_FACTOR);
65 rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
66 ns += quot * per_cpu(cyc2ns, cpu) +
67 ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
47 return ns; 68 return ns;
48} 69}
49 70
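The comment added above is easy to verify numerically: with CYC2NS_SCALE_FACTOR = 10 and a roughly 3 GHz TSC the scale value is about 341, so the naive product cycles * scale exceeds 64 bits after a bit over 200 days of uptime, while the quotient/remainder form keeps every intermediate within range. A small user-space sketch with illustrative numbers, not kernel code:

	#include <stdio.h>
	#include <stdint.h>

	#define CYC2NS_SCALE_FACTOR 10			/* SC = 2^10, as in the kernel */

	int main(void)
	{
		uint64_t scale = (1000000ULL << CYC2NS_SCALE_FACTOR) / 3000000;	/* ~341 */
		uint64_t cyc   = 60000000000000000ULL;	/* ~231 days at 3 GHz */

		/* Naive form: cyc * scale wraps around 2^64, so the result is bogus. */
		uint64_t naive = (cyc * scale) >> CYC2NS_SCALE_FACTOR;

		/* Decomposed form used by __cycles_2_ns() above: stays in range. */
		uint64_t quot  = cyc >> CYC2NS_SCALE_FACTOR;
		uint64_t rem   = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
		uint64_t split = quot * scale + ((rem * scale) >> CYC2NS_SCALE_FACTOR);

		printf("naive: %llu ns\nsplit: %llu ns\n",
		       (unsigned long long)naive, (unsigned long long)split);
		return 0;
	}

The split result matches cyc * 341 / 1024 computed with arbitrary precision; the naive result does not.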
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c00692476e9f..b9676ae37ada 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void);
130 .balance_interval = 1, \ 130 .balance_interval = 1, \
131} 131}
132 132
133#ifdef CONFIG_X86_64
134extern int __node_distance(int, int); 133extern int __node_distance(int, int);
135#define node_distance(a, b) __node_distance(a, b) 134#define node_distance(a, b) __node_distance(a, b)
136#endif
137 135
138#else /* !CONFIG_NUMA */ 136#else /* !CONFIG_NUMA */
139 137
@@ -174,7 +172,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
174} 172}
175 173
176struct pci_bus; 174struct pci_bus;
177void x86_pci_root_bus_res_quirks(struct pci_bus *b); 175void x86_pci_root_bus_resources(int bus, struct list_head *resources);
178 176
179#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
180#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ 178#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd181e2..15d99153a96d 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54extern int tsc_clocksource_reliable;
55
54/* 56/*
55 * Boot-time check whether the TSCs are synchronized across 57 * Boot-time check whether the TSCs are synchronized across
56 * all CPUs/cores: 58 * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 36361bf6fdd1..8be5f54d9360 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
462 barrier(); 462 barrier();
463 463
464#define uaccess_catch(err) \ 464#define uaccess_catch(err) \
465 (err) |= current_thread_info()->uaccess_err; \ 465 (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
466 current_thread_info()->uaccess_err = prev_err; \ 466 current_thread_info()->uaccess_err = prev_err; \
467} while (0) 467} while (0)
468 468
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3e51d8..21f77b89e47a 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,60 @@
1#ifndef _ASM_X86_UNISTD_H
2#define _ASM_X86_UNISTD_H 1
3
1#ifdef __KERNEL__ 4#ifdef __KERNEL__
2# ifdef CONFIG_X86_32 5# ifdef CONFIG_X86_32
3# include "unistd_32.h" 6
7# include <asm/unistd_32.h>
8# define __ARCH_WANT_IPC_PARSE_VERSION
9# define __ARCH_WANT_STAT64
10# define __ARCH_WANT_SYS_IPC
11# define __ARCH_WANT_SYS_OLD_MMAP
12# define __ARCH_WANT_SYS_OLD_SELECT
13
4# else 14# else
5# include "unistd_64.h" 15
16# include <asm/unistd_64.h>
17# define __ARCH_WANT_COMPAT_SYS_TIME
18
6# endif 19# endif
20
21# define __ARCH_WANT_OLD_READDIR
22# define __ARCH_WANT_OLD_STAT
23# define __ARCH_WANT_SYS_ALARM
24# define __ARCH_WANT_SYS_FADVISE64
25# define __ARCH_WANT_SYS_GETHOSTNAME
26# define __ARCH_WANT_SYS_GETPGRP
27# define __ARCH_WANT_SYS_LLSEEK
28# define __ARCH_WANT_SYS_NICE
29# define __ARCH_WANT_SYS_OLDUMOUNT
30# define __ARCH_WANT_SYS_OLD_GETRLIMIT
31# define __ARCH_WANT_SYS_OLD_UNAME
32# define __ARCH_WANT_SYS_PAUSE
33# define __ARCH_WANT_SYS_RT_SIGACTION
34# define __ARCH_WANT_SYS_RT_SIGSUSPEND
35# define __ARCH_WANT_SYS_SGETMASK
36# define __ARCH_WANT_SYS_SIGNAL
37# define __ARCH_WANT_SYS_SIGPENDING
38# define __ARCH_WANT_SYS_SIGPROCMASK
39# define __ARCH_WANT_SYS_SOCKETCALL
40# define __ARCH_WANT_SYS_TIME
41# define __ARCH_WANT_SYS_UTIME
42# define __ARCH_WANT_SYS_WAITPID
43
44/*
45 * "Conditional" syscalls
46 *
47 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
48 * but it doesn't work on all toolchains, so we just do it by hand
49 */
50# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
51
7#else 52#else
8# ifdef __i386__ 53# ifdef __i386__
9# include "unistd_32.h" 54# include <asm/unistd_32.h>
10# else 55# else
11# include "unistd_64.h" 56# include <asm/unistd_64.h>
12# endif 57# endif
13#endif 58#endif
59
60#endif /* _ASM_X86_UNISTD_H */
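The cond_syscall() definition folded into unistd.h above creates a weak alias by hand because, as its comment notes, __attribute__((weak, alias(...))) is not reliable on every toolchain (the alias target generally has to be defined in the same translation unit). A hedged illustration of what the two spellings look like for a made-up entry point sys_foo; neither line is taken from the kernel source:

	/* What cond_syscall(sys_foo) expands to: a weak symbol set to the
	 * "not implemented" handler at the assembler level. */
	asm(".weak\tsys_foo\n\t.set\tsys_foo,sys_ni_syscall");

	/* The attribute form the comment would prefer, shown only for contrast. */
	long sys_foo(void) __attribute__((weak, alias("sys_ni_syscall")));

Because the assembler version never mentions a C prototype, it sidesteps both the same-translation-unit restriction and any argument-signature mismatch.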
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 599c77d38f33..000000000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,401 +0,0 @@
1#ifndef _ASM_X86_UNISTD_32_H
2#define _ASM_X86_UNISTD_32_H
3
4/*
5 * This file contains the system call numbers.
6 */
7
8#define __NR_restart_syscall 0
9#define __NR_exit 1
10#define __NR_fork 2
11#define __NR_read 3
12#define __NR_write 4
13#define __NR_open 5
14#define __NR_close 6
15#define __NR_waitpid 7
16#define __NR_creat 8
17#define __NR_link 9
18#define __NR_unlink 10
19#define __NR_execve 11
20#define __NR_chdir 12
21#define __NR_time 13
22#define __NR_mknod 14
23#define __NR_chmod 15
24#define __NR_lchown 16
25#define __NR_break 17
26#define __NR_oldstat 18
27#define __NR_lseek 19
28#define __NR_getpid 20
29#define __NR_mount 21
30#define __NR_umount 22
31#define __NR_setuid 23
32#define __NR_getuid 24
33#define __NR_stime 25
34#define __NR_ptrace 26
35#define __NR_alarm 27
36#define __NR_oldfstat 28
37#define __NR_pause 29
38#define __NR_utime 30
39#define __NR_stty 31
40#define __NR_gtty 32
41#define __NR_access 33
42#define __NR_nice 34
43#define __NR_ftime 35
44#define __NR_sync 36
45#define __NR_kill 37
46#define __NR_rename 38
47#define __NR_mkdir 39
48#define __NR_rmdir 40
49#define __NR_dup 41
50#define __NR_pipe 42
51#define __NR_times 43
52#define __NR_prof 44
53#define __NR_brk 45
54#define __NR_setgid 46
55#define __NR_getgid 47
56#define __NR_signal 48
57#define __NR_geteuid 49
58#define __NR_getegid 50
59#define __NR_acct 51
60#define __NR_umount2 52
61#define __NR_lock 53
62#define __NR_ioctl 54
63#define __NR_fcntl 55
64#define __NR_mpx 56
65#define __NR_setpgid 57
66#define __NR_ulimit 58
67#define __NR_oldolduname 59
68#define __NR_umask 60
69#define __NR_chroot 61
70#define __NR_ustat 62
71#define __NR_dup2 63
72#define __NR_getppid 64
73#define __NR_getpgrp 65
74#define __NR_setsid 66
75#define __NR_sigaction 67
76#define __NR_sgetmask 68
77#define __NR_ssetmask 69
78#define __NR_setreuid 70
79#define __NR_setregid 71
80#define __NR_sigsuspend 72
81#define __NR_sigpending 73
82#define __NR_sethostname 74
83#define __NR_setrlimit 75
84#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
85#define __NR_getrusage 77
86#define __NR_gettimeofday 78
87#define __NR_settimeofday 79
88#define __NR_getgroups 80
89#define __NR_setgroups 81
90#define __NR_select 82
91#define __NR_symlink 83
92#define __NR_oldlstat 84
93#define __NR_readlink 85
94#define __NR_uselib 86
95#define __NR_swapon 87
96#define __NR_reboot 88
97#define __NR_readdir 89
98#define __NR_mmap 90
99#define __NR_munmap 91
100#define __NR_truncate 92
101#define __NR_ftruncate 93
102#define __NR_fchmod 94
103#define __NR_fchown 95
104#define __NR_getpriority 96
105#define __NR_setpriority 97
106#define __NR_profil 98
107#define __NR_statfs 99
108#define __NR_fstatfs 100
109#define __NR_ioperm 101
110#define __NR_socketcall 102
111#define __NR_syslog 103
112#define __NR_setitimer 104
113#define __NR_getitimer 105
114#define __NR_stat 106
115#define __NR_lstat 107
116#define __NR_fstat 108
117#define __NR_olduname 109
118#define __NR_iopl 110
119#define __NR_vhangup 111
120#define __NR_idle 112
121#define __NR_vm86old 113
122#define __NR_wait4 114
123#define __NR_swapoff 115
124#define __NR_sysinfo 116
125#define __NR_ipc 117
126#define __NR_fsync 118
127#define __NR_sigreturn 119
128#define __NR_clone 120
129#define __NR_setdomainname 121
130#define __NR_uname 122
131#define __NR_modify_ldt 123
132#define __NR_adjtimex 124
133#define __NR_mprotect 125
134#define __NR_sigprocmask 126
135#define __NR_create_module 127
136#define __NR_init_module 128
137#define __NR_delete_module 129
138#define __NR_get_kernel_syms 130
139#define __NR_quotactl 131
140#define __NR_getpgid 132
141#define __NR_fchdir 133
142#define __NR_bdflush 134
143#define __NR_sysfs 135
144#define __NR_personality 136
145#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
146#define __NR_setfsuid 138
147#define __NR_setfsgid 139
148#define __NR__llseek 140
149#define __NR_getdents 141
150#define __NR__newselect 142
151#define __NR_flock 143
152#define __NR_msync 144
153#define __NR_readv 145
154#define __NR_writev 146
155#define __NR_getsid 147
156#define __NR_fdatasync 148
157#define __NR__sysctl 149
158#define __NR_mlock 150
159#define __NR_munlock 151
160#define __NR_mlockall 152
161#define __NR_munlockall 153
162#define __NR_sched_setparam 154
163#define __NR_sched_getparam 155
164#define __NR_sched_setscheduler 156
165#define __NR_sched_getscheduler 157
166#define __NR_sched_yield 158
167#define __NR_sched_get_priority_max 159
168#define __NR_sched_get_priority_min 160
169#define __NR_sched_rr_get_interval 161
170#define __NR_nanosleep 162
171#define __NR_mremap 163
172#define __NR_setresuid 164
173#define __NR_getresuid 165
174#define __NR_vm86 166
175#define __NR_query_module 167
176#define __NR_poll 168
177#define __NR_nfsservctl 169
178#define __NR_setresgid 170
179#define __NR_getresgid 171
180#define __NR_prctl 172
181#define __NR_rt_sigreturn 173
182#define __NR_rt_sigaction 174
183#define __NR_rt_sigprocmask 175
184#define __NR_rt_sigpending 176
185#define __NR_rt_sigtimedwait 177
186#define __NR_rt_sigqueueinfo 178
187#define __NR_rt_sigsuspend 179
188#define __NR_pread64 180
189#define __NR_pwrite64 181
190#define __NR_chown 182
191#define __NR_getcwd 183
192#define __NR_capget 184
193#define __NR_capset 185
194#define __NR_sigaltstack 186
195#define __NR_sendfile 187
196#define __NR_getpmsg 188 /* some people actually want streams */
197#define __NR_putpmsg 189 /* some people actually want streams */
198#define __NR_vfork 190
199#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
200#define __NR_mmap2 192
201#define __NR_truncate64 193
202#define __NR_ftruncate64 194
203#define __NR_stat64 195
204#define __NR_lstat64 196
205#define __NR_fstat64 197
206#define __NR_lchown32 198
207#define __NR_getuid32 199
208#define __NR_getgid32 200
209#define __NR_geteuid32 201
210#define __NR_getegid32 202
211#define __NR_setreuid32 203
212#define __NR_setregid32 204
213#define __NR_getgroups32 205
214#define __NR_setgroups32 206
215#define __NR_fchown32 207
216#define __NR_setresuid32 208
217#define __NR_getresuid32 209
218#define __NR_setresgid32 210
219#define __NR_getresgid32 211
220#define __NR_chown32 212
221#define __NR_setuid32 213
222#define __NR_setgid32 214
223#define __NR_setfsuid32 215
224#define __NR_setfsgid32 216
225#define __NR_pivot_root 217
226#define __NR_mincore 218
227#define __NR_madvise 219
228#define __NR_madvise1 219 /* delete when C lib stub is removed */
229#define __NR_getdents64 220
230#define __NR_fcntl64 221
231/* 223 is unused */
232#define __NR_gettid 224
233#define __NR_readahead 225
234#define __NR_setxattr 226
235#define __NR_lsetxattr 227
236#define __NR_fsetxattr 228
237#define __NR_getxattr 229
238#define __NR_lgetxattr 230
239#define __NR_fgetxattr 231
240#define __NR_listxattr 232
241#define __NR_llistxattr 233
242#define __NR_flistxattr 234
243#define __NR_removexattr 235
244#define __NR_lremovexattr 236
245#define __NR_fremovexattr 237
246#define __NR_tkill 238
247#define __NR_sendfile64 239
248#define __NR_futex 240
249#define __NR_sched_setaffinity 241
250#define __NR_sched_getaffinity 242
251#define __NR_set_thread_area 243
252#define __NR_get_thread_area 244
253#define __NR_io_setup 245
254#define __NR_io_destroy 246
255#define __NR_io_getevents 247
256#define __NR_io_submit 248
257#define __NR_io_cancel 249
258#define __NR_fadvise64 250
259/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
260#define __NR_exit_group 252
261#define __NR_lookup_dcookie 253
262#define __NR_epoll_create 254
263#define __NR_epoll_ctl 255
264#define __NR_epoll_wait 256
265#define __NR_remap_file_pages 257
266#define __NR_set_tid_address 258
267#define __NR_timer_create 259
268#define __NR_timer_settime (__NR_timer_create+1)
269#define __NR_timer_gettime (__NR_timer_create+2)
270#define __NR_timer_getoverrun (__NR_timer_create+3)
271#define __NR_timer_delete (__NR_timer_create+4)
272#define __NR_clock_settime (__NR_timer_create+5)
273#define __NR_clock_gettime (__NR_timer_create+6)
274#define __NR_clock_getres (__NR_timer_create+7)
275#define __NR_clock_nanosleep (__NR_timer_create+8)
276#define __NR_statfs64 268
277#define __NR_fstatfs64 269
278#define __NR_tgkill 270
279#define __NR_utimes 271
280#define __NR_fadvise64_64 272
281#define __NR_vserver 273
282#define __NR_mbind 274
283#define __NR_get_mempolicy 275
284#define __NR_set_mempolicy 276
285#define __NR_mq_open 277
286#define __NR_mq_unlink (__NR_mq_open+1)
287#define __NR_mq_timedsend (__NR_mq_open+2)
288#define __NR_mq_timedreceive (__NR_mq_open+3)
289#define __NR_mq_notify (__NR_mq_open+4)
290#define __NR_mq_getsetattr (__NR_mq_open+5)
291#define __NR_kexec_load 283
292#define __NR_waitid 284
293/* #define __NR_sys_setaltroot 285 */
294#define __NR_add_key 286
295#define __NR_request_key 287
296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
299#define __NR_inotify_init 291
300#define __NR_inotify_add_watch 292
301#define __NR_inotify_rm_watch 293
302#define __NR_migrate_pages 294
303#define __NR_openat 295
304#define __NR_mkdirat 296
305#define __NR_mknodat 297
306#define __NR_fchownat 298
307#define __NR_futimesat 299
308#define __NR_fstatat64 300
309#define __NR_unlinkat 301
310#define __NR_renameat 302
311#define __NR_linkat 303
312#define __NR_symlinkat 304
313#define __NR_readlinkat 305
314#define __NR_fchmodat 306
315#define __NR_faccessat 307
316#define __NR_pselect6 308
317#define __NR_ppoll 309
318#define __NR_unshare 310
319#define __NR_set_robust_list 311
320#define __NR_get_robust_list 312
321#define __NR_splice 313
322#define __NR_sync_file_range 314
323#define __NR_tee 315
324#define __NR_vmsplice 316
325#define __NR_move_pages 317
326#define __NR_getcpu 318
327#define __NR_epoll_pwait 319
328#define __NR_utimensat 320
329#define __NR_signalfd 321
330#define __NR_timerfd_create 322
331#define __NR_eventfd 323
332#define __NR_fallocate 324
333#define __NR_timerfd_settime 325
334#define __NR_timerfd_gettime 326
335#define __NR_signalfd4 327
336#define __NR_eventfd2 328
337#define __NR_epoll_create1 329
338#define __NR_dup3 330
339#define __NR_pipe2 331
340#define __NR_inotify_init1 332
341#define __NR_preadv 333
342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
353#define __NR_sendmmsg 345
354#define __NR_setns 346
355#define __NR_process_vm_readv 347
356#define __NR_process_vm_writev 348
357
358#ifdef __KERNEL__
359
360#define NR_syscalls 349
361
362#define __ARCH_WANT_IPC_PARSE_VERSION
363#define __ARCH_WANT_OLD_READDIR
364#define __ARCH_WANT_OLD_STAT
365#define __ARCH_WANT_STAT64
366#define __ARCH_WANT_SYS_ALARM
367#define __ARCH_WANT_SYS_GETHOSTNAME
368#define __ARCH_WANT_SYS_IPC
369#define __ARCH_WANT_SYS_PAUSE
370#define __ARCH_WANT_SYS_SGETMASK
371#define __ARCH_WANT_SYS_SIGNAL
372#define __ARCH_WANT_SYS_TIME
373#define __ARCH_WANT_SYS_UTIME
374#define __ARCH_WANT_SYS_WAITPID
375#define __ARCH_WANT_SYS_SOCKETCALL
376#define __ARCH_WANT_SYS_FADVISE64
377#define __ARCH_WANT_SYS_GETPGRP
378#define __ARCH_WANT_SYS_LLSEEK
379#define __ARCH_WANT_SYS_NICE
380#define __ARCH_WANT_SYS_OLD_GETRLIMIT
381#define __ARCH_WANT_SYS_OLD_UNAME
382#define __ARCH_WANT_SYS_OLD_MMAP
383#define __ARCH_WANT_SYS_OLD_SELECT
384#define __ARCH_WANT_SYS_OLDUMOUNT
385#define __ARCH_WANT_SYS_SIGPENDING
386#define __ARCH_WANT_SYS_SIGPROCMASK
387#define __ARCH_WANT_SYS_RT_SIGACTION
388#define __ARCH_WANT_SYS_RT_SIGSUSPEND
389
390/*
391 * "Conditional" syscalls
392 *
393 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
394 * but it doesn't work on all toolchains, so we just do it by hand
395 */
396#ifndef cond_syscall
397#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
398#endif
399
400#endif /* __KERNEL__ */
401#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 0431f193c3f2..000000000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,732 +0,0 @@
1#ifndef _ASM_X86_UNISTD_64_H
2#define _ASM_X86_UNISTD_64_H
3
4#ifndef __SYSCALL
5#define __SYSCALL(a, b)
6#endif
7
8/*
9 * This file contains the system call numbers.
10 *
11 * Note: holes are not allowed.
12 */
13
14/* at least 8 syscall per cacheline */
15#define __NR_read 0
16__SYSCALL(__NR_read, sys_read)
17#define __NR_write 1
18__SYSCALL(__NR_write, sys_write)
19#define __NR_open 2
20__SYSCALL(__NR_open, sys_open)
21#define __NR_close 3
22__SYSCALL(__NR_close, sys_close)
23#define __NR_stat 4
24__SYSCALL(__NR_stat, sys_newstat)
25#define __NR_fstat 5
26__SYSCALL(__NR_fstat, sys_newfstat)
27#define __NR_lstat 6
28__SYSCALL(__NR_lstat, sys_newlstat)
29#define __NR_poll 7
30__SYSCALL(__NR_poll, sys_poll)
31
32#define __NR_lseek 8
33__SYSCALL(__NR_lseek, sys_lseek)
34#define __NR_mmap 9
35__SYSCALL(__NR_mmap, sys_mmap)
36#define __NR_mprotect 10
37__SYSCALL(__NR_mprotect, sys_mprotect)
38#define __NR_munmap 11
39__SYSCALL(__NR_munmap, sys_munmap)
40#define __NR_brk 12
41__SYSCALL(__NR_brk, sys_brk)
42#define __NR_rt_sigaction 13
43__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
44#define __NR_rt_sigprocmask 14
45__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
46#define __NR_rt_sigreturn 15
47__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
48
49#define __NR_ioctl 16
50__SYSCALL(__NR_ioctl, sys_ioctl)
51#define __NR_pread64 17
52__SYSCALL(__NR_pread64, sys_pread64)
53#define __NR_pwrite64 18
54__SYSCALL(__NR_pwrite64, sys_pwrite64)
55#define __NR_readv 19
56__SYSCALL(__NR_readv, sys_readv)
57#define __NR_writev 20
58__SYSCALL(__NR_writev, sys_writev)
59#define __NR_access 21
60__SYSCALL(__NR_access, sys_access)
61#define __NR_pipe 22
62__SYSCALL(__NR_pipe, sys_pipe)
63#define __NR_select 23
64__SYSCALL(__NR_select, sys_select)
65
66#define __NR_sched_yield 24
67__SYSCALL(__NR_sched_yield, sys_sched_yield)
68#define __NR_mremap 25
69__SYSCALL(__NR_mremap, sys_mremap)
70#define __NR_msync 26
71__SYSCALL(__NR_msync, sys_msync)
72#define __NR_mincore 27
73__SYSCALL(__NR_mincore, sys_mincore)
74#define __NR_madvise 28
75__SYSCALL(__NR_madvise, sys_madvise)
76#define __NR_shmget 29
77__SYSCALL(__NR_shmget, sys_shmget)
78#define __NR_shmat 30
79__SYSCALL(__NR_shmat, sys_shmat)
80#define __NR_shmctl 31
81__SYSCALL(__NR_shmctl, sys_shmctl)
82
83#define __NR_dup 32
84__SYSCALL(__NR_dup, sys_dup)
85#define __NR_dup2 33
86__SYSCALL(__NR_dup2, sys_dup2)
87#define __NR_pause 34
88__SYSCALL(__NR_pause, sys_pause)
89#define __NR_nanosleep 35
90__SYSCALL(__NR_nanosleep, sys_nanosleep)
91#define __NR_getitimer 36
92__SYSCALL(__NR_getitimer, sys_getitimer)
93#define __NR_alarm 37
94__SYSCALL(__NR_alarm, sys_alarm)
95#define __NR_setitimer 38
96__SYSCALL(__NR_setitimer, sys_setitimer)
97#define __NR_getpid 39
98__SYSCALL(__NR_getpid, sys_getpid)
99
100#define __NR_sendfile 40
101__SYSCALL(__NR_sendfile, sys_sendfile64)
102#define __NR_socket 41
103__SYSCALL(__NR_socket, sys_socket)
104#define __NR_connect 42
105__SYSCALL(__NR_connect, sys_connect)
106#define __NR_accept 43
107__SYSCALL(__NR_accept, sys_accept)
108#define __NR_sendto 44
109__SYSCALL(__NR_sendto, sys_sendto)
110#define __NR_recvfrom 45
111__SYSCALL(__NR_recvfrom, sys_recvfrom)
112#define __NR_sendmsg 46
113__SYSCALL(__NR_sendmsg, sys_sendmsg)
114#define __NR_recvmsg 47
115__SYSCALL(__NR_recvmsg, sys_recvmsg)
116
117#define __NR_shutdown 48
118__SYSCALL(__NR_shutdown, sys_shutdown)
119#define __NR_bind 49
120__SYSCALL(__NR_bind, sys_bind)
121#define __NR_listen 50
122__SYSCALL(__NR_listen, sys_listen)
123#define __NR_getsockname 51
124__SYSCALL(__NR_getsockname, sys_getsockname)
125#define __NR_getpeername 52
126__SYSCALL(__NR_getpeername, sys_getpeername)
127#define __NR_socketpair 53
128__SYSCALL(__NR_socketpair, sys_socketpair)
129#define __NR_setsockopt 54
130__SYSCALL(__NR_setsockopt, sys_setsockopt)
131#define __NR_getsockopt 55
132__SYSCALL(__NR_getsockopt, sys_getsockopt)
133
134#define __NR_clone 56
135__SYSCALL(__NR_clone, stub_clone)
136#define __NR_fork 57
137__SYSCALL(__NR_fork, stub_fork)
138#define __NR_vfork 58
139__SYSCALL(__NR_vfork, stub_vfork)
140#define __NR_execve 59
141__SYSCALL(__NR_execve, stub_execve)
142#define __NR_exit 60
143__SYSCALL(__NR_exit, sys_exit)
144#define __NR_wait4 61
145__SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_newuname)
150
151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget)
153#define __NR_semop 65
154__SYSCALL(__NR_semop, sys_semop)
155#define __NR_semctl 66
156__SYSCALL(__NR_semctl, sys_semctl)
157#define __NR_shmdt 67
158__SYSCALL(__NR_shmdt, sys_shmdt)
159#define __NR_msgget 68
160__SYSCALL(__NR_msgget, sys_msgget)
161#define __NR_msgsnd 69
162__SYSCALL(__NR_msgsnd, sys_msgsnd)
163#define __NR_msgrcv 70
164__SYSCALL(__NR_msgrcv, sys_msgrcv)
165#define __NR_msgctl 71
166__SYSCALL(__NR_msgctl, sys_msgctl)
167
168#define __NR_fcntl 72
169__SYSCALL(__NR_fcntl, sys_fcntl)
170#define __NR_flock 73
171__SYSCALL(__NR_flock, sys_flock)
172#define __NR_fsync 74
173__SYSCALL(__NR_fsync, sys_fsync)
174#define __NR_fdatasync 75
175__SYSCALL(__NR_fdatasync, sys_fdatasync)
176#define __NR_truncate 76
177__SYSCALL(__NR_truncate, sys_truncate)
178#define __NR_ftruncate 77
179__SYSCALL(__NR_ftruncate, sys_ftruncate)
180#define __NR_getdents 78
181__SYSCALL(__NR_getdents, sys_getdents)
182#define __NR_getcwd 79
183__SYSCALL(__NR_getcwd, sys_getcwd)
184
185#define __NR_chdir 80
186__SYSCALL(__NR_chdir, sys_chdir)
187#define __NR_fchdir 81
188__SYSCALL(__NR_fchdir, sys_fchdir)
189#define __NR_rename 82
190__SYSCALL(__NR_rename, sys_rename)
191#define __NR_mkdir 83
192__SYSCALL(__NR_mkdir, sys_mkdir)
193#define __NR_rmdir 84
194__SYSCALL(__NR_rmdir, sys_rmdir)
195#define __NR_creat 85
196__SYSCALL(__NR_creat, sys_creat)
197#define __NR_link 86
198__SYSCALL(__NR_link, sys_link)
199#define __NR_unlink 87
200__SYSCALL(__NR_unlink, sys_unlink)
201
202#define __NR_symlink 88
203__SYSCALL(__NR_symlink, sys_symlink)
204#define __NR_readlink 89
205__SYSCALL(__NR_readlink, sys_readlink)
206#define __NR_chmod 90
207__SYSCALL(__NR_chmod, sys_chmod)
208#define __NR_fchmod 91
209__SYSCALL(__NR_fchmod, sys_fchmod)
210#define __NR_chown 92
211__SYSCALL(__NR_chown, sys_chown)
212#define __NR_fchown 93
213__SYSCALL(__NR_fchown, sys_fchown)
214#define __NR_lchown 94
215__SYSCALL(__NR_lchown, sys_lchown)
216#define __NR_umask 95
217__SYSCALL(__NR_umask, sys_umask)
218
219#define __NR_gettimeofday 96
220__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
221#define __NR_getrlimit 97
222__SYSCALL(__NR_getrlimit, sys_getrlimit)
223#define __NR_getrusage 98
224__SYSCALL(__NR_getrusage, sys_getrusage)
225#define __NR_sysinfo 99
226__SYSCALL(__NR_sysinfo, sys_sysinfo)
227#define __NR_times 100
228__SYSCALL(__NR_times, sys_times)
229#define __NR_ptrace 101
230__SYSCALL(__NR_ptrace, sys_ptrace)
231#define __NR_getuid 102
232__SYSCALL(__NR_getuid, sys_getuid)
233#define __NR_syslog 103
234__SYSCALL(__NR_syslog, sys_syslog)
235
236/* at the very end the stuff that never runs during the benchmarks */
237#define __NR_getgid 104
238__SYSCALL(__NR_getgid, sys_getgid)
239#define __NR_setuid 105
240__SYSCALL(__NR_setuid, sys_setuid)
241#define __NR_setgid 106
242__SYSCALL(__NR_setgid, sys_setgid)
243#define __NR_geteuid 107
244__SYSCALL(__NR_geteuid, sys_geteuid)
245#define __NR_getegid 108
246__SYSCALL(__NR_getegid, sys_getegid)
247#define __NR_setpgid 109
248__SYSCALL(__NR_setpgid, sys_setpgid)
249#define __NR_getppid 110
250__SYSCALL(__NR_getppid, sys_getppid)
251#define __NR_getpgrp 111
252__SYSCALL(__NR_getpgrp, sys_getpgrp)
253
254#define __NR_setsid 112
255__SYSCALL(__NR_setsid, sys_setsid)
256#define __NR_setreuid 113
257__SYSCALL(__NR_setreuid, sys_setreuid)
258#define __NR_setregid 114
259__SYSCALL(__NR_setregid, sys_setregid)
260#define __NR_getgroups 115
261__SYSCALL(__NR_getgroups, sys_getgroups)
262#define __NR_setgroups 116
263__SYSCALL(__NR_setgroups, sys_setgroups)
264#define __NR_setresuid 117
265__SYSCALL(__NR_setresuid, sys_setresuid)
266#define __NR_getresuid 118
267__SYSCALL(__NR_getresuid, sys_getresuid)
268#define __NR_setresgid 119
269__SYSCALL(__NR_setresgid, sys_setresgid)
270
271#define __NR_getresgid 120
272__SYSCALL(__NR_getresgid, sys_getresgid)
273#define __NR_getpgid 121
274__SYSCALL(__NR_getpgid, sys_getpgid)
275#define __NR_setfsuid 122
276__SYSCALL(__NR_setfsuid, sys_setfsuid)
277#define __NR_setfsgid 123
278__SYSCALL(__NR_setfsgid, sys_setfsgid)
279#define __NR_getsid 124
280__SYSCALL(__NR_getsid, sys_getsid)
281#define __NR_capget 125
282__SYSCALL(__NR_capget, sys_capget)
283#define __NR_capset 126
284__SYSCALL(__NR_capset, sys_capset)
285
286#define __NR_rt_sigpending 127
287__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
288#define __NR_rt_sigtimedwait 128
289__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
290#define __NR_rt_sigqueueinfo 129
291__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
292#define __NR_rt_sigsuspend 130
293__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
294#define __NR_sigaltstack 131
295__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
296#define __NR_utime 132
297__SYSCALL(__NR_utime, sys_utime)
298#define __NR_mknod 133
299__SYSCALL(__NR_mknod, sys_mknod)
300
301/* Only needed for a.out */
302#define __NR_uselib 134
303__SYSCALL(__NR_uselib, sys_ni_syscall)
304#define __NR_personality 135
305__SYSCALL(__NR_personality, sys_personality)
306
307#define __NR_ustat 136
308__SYSCALL(__NR_ustat, sys_ustat)
309#define __NR_statfs 137
310__SYSCALL(__NR_statfs, sys_statfs)
311#define __NR_fstatfs 138
312__SYSCALL(__NR_fstatfs, sys_fstatfs)
313#define __NR_sysfs 139
314__SYSCALL(__NR_sysfs, sys_sysfs)
315
316#define __NR_getpriority 140
317__SYSCALL(__NR_getpriority, sys_getpriority)
318#define __NR_setpriority 141
319__SYSCALL(__NR_setpriority, sys_setpriority)
320#define __NR_sched_setparam 142
321__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
322#define __NR_sched_getparam 143
323__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
324#define __NR_sched_setscheduler 144
325__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
326#define __NR_sched_getscheduler 145
327__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
328#define __NR_sched_get_priority_max 146
329__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
330#define __NR_sched_get_priority_min 147
331__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
332#define __NR_sched_rr_get_interval 148
333__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
334
335#define __NR_mlock 149
336__SYSCALL(__NR_mlock, sys_mlock)
337#define __NR_munlock 150
338__SYSCALL(__NR_munlock, sys_munlock)
339#define __NR_mlockall 151
340__SYSCALL(__NR_mlockall, sys_mlockall)
341#define __NR_munlockall 152
342__SYSCALL(__NR_munlockall, sys_munlockall)
343
344#define __NR_vhangup 153
345__SYSCALL(__NR_vhangup, sys_vhangup)
346
347#define __NR_modify_ldt 154
348__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
349
350#define __NR_pivot_root 155
351__SYSCALL(__NR_pivot_root, sys_pivot_root)
352
353#define __NR__sysctl 156
354__SYSCALL(__NR__sysctl, sys_sysctl)
355
356#define __NR_prctl 157
357__SYSCALL(__NR_prctl, sys_prctl)
358#define __NR_arch_prctl 158
359__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
360
361#define __NR_adjtimex 159
362__SYSCALL(__NR_adjtimex, sys_adjtimex)
363
364#define __NR_setrlimit 160
365__SYSCALL(__NR_setrlimit, sys_setrlimit)
366
367#define __NR_chroot 161
368__SYSCALL(__NR_chroot, sys_chroot)
369
370#define __NR_sync 162
371__SYSCALL(__NR_sync, sys_sync)
372
373#define __NR_acct 163
374__SYSCALL(__NR_acct, sys_acct)
375
376#define __NR_settimeofday 164
377__SYSCALL(__NR_settimeofday, sys_settimeofday)
378
379#define __NR_mount 165
380__SYSCALL(__NR_mount, sys_mount)
381#define __NR_umount2 166
382__SYSCALL(__NR_umount2, sys_umount)
383
384#define __NR_swapon 167
385__SYSCALL(__NR_swapon, sys_swapon)
386#define __NR_swapoff 168
387__SYSCALL(__NR_swapoff, sys_swapoff)
388
389#define __NR_reboot 169
390__SYSCALL(__NR_reboot, sys_reboot)
391
392#define __NR_sethostname 170
393__SYSCALL(__NR_sethostname, sys_sethostname)
394#define __NR_setdomainname 171
395__SYSCALL(__NR_setdomainname, sys_setdomainname)
396
397#define __NR_iopl 172
398__SYSCALL(__NR_iopl, stub_iopl)
399#define __NR_ioperm 173
400__SYSCALL(__NR_ioperm, sys_ioperm)
401
402#define __NR_create_module 174
403__SYSCALL(__NR_create_module, sys_ni_syscall)
404#define __NR_init_module 175
405__SYSCALL(__NR_init_module, sys_init_module)
406#define __NR_delete_module 176
407__SYSCALL(__NR_delete_module, sys_delete_module)
408#define __NR_get_kernel_syms 177
409__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
410#define __NR_query_module 178
411__SYSCALL(__NR_query_module, sys_ni_syscall)
412
413#define __NR_quotactl 179
414__SYSCALL(__NR_quotactl, sys_quotactl)
415
416#define __NR_nfsservctl 180
417__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
418
419/* reserved for LiS/STREAMS */
420#define __NR_getpmsg 181
421__SYSCALL(__NR_getpmsg, sys_ni_syscall)
422#define __NR_putpmsg 182
423__SYSCALL(__NR_putpmsg, sys_ni_syscall)
424
425/* reserved for AFS */
426#define __NR_afs_syscall 183
427__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
428
429/* reserved for tux */
430#define __NR_tuxcall 184
431__SYSCALL(__NR_tuxcall, sys_ni_syscall)
432
433#define __NR_security 185
434__SYSCALL(__NR_security, sys_ni_syscall)
435
436#define __NR_gettid 186
437__SYSCALL(__NR_gettid, sys_gettid)
438
439#define __NR_readahead 187
440__SYSCALL(__NR_readahead, sys_readahead)
441#define __NR_setxattr 188
442__SYSCALL(__NR_setxattr, sys_setxattr)
443#define __NR_lsetxattr 189
444__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
445#define __NR_fsetxattr 190
446__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
447#define __NR_getxattr 191
448__SYSCALL(__NR_getxattr, sys_getxattr)
449#define __NR_lgetxattr 192
450__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
451#define __NR_fgetxattr 193
452__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
453#define __NR_listxattr 194
454__SYSCALL(__NR_listxattr, sys_listxattr)
455#define __NR_llistxattr 195
456__SYSCALL(__NR_llistxattr, sys_llistxattr)
457#define __NR_flistxattr 196
458__SYSCALL(__NR_flistxattr, sys_flistxattr)
459#define __NR_removexattr 197
460__SYSCALL(__NR_removexattr, sys_removexattr)
461#define __NR_lremovexattr 198
462__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
463#define __NR_fremovexattr 199
464__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
465#define __NR_tkill 200
466__SYSCALL(__NR_tkill, sys_tkill)
467#define __NR_time 201
468__SYSCALL(__NR_time, sys_time)
469#define __NR_futex 202
470__SYSCALL(__NR_futex, sys_futex)
471#define __NR_sched_setaffinity 203
472__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
473#define __NR_sched_getaffinity 204
474__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
475#define __NR_set_thread_area 205
476__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
477#define __NR_io_setup 206
478__SYSCALL(__NR_io_setup, sys_io_setup)
479#define __NR_io_destroy 207
480__SYSCALL(__NR_io_destroy, sys_io_destroy)
481#define __NR_io_getevents 208
482__SYSCALL(__NR_io_getevents, sys_io_getevents)
483#define __NR_io_submit 209
484__SYSCALL(__NR_io_submit, sys_io_submit)
485#define __NR_io_cancel 210
486__SYSCALL(__NR_io_cancel, sys_io_cancel)
487#define __NR_get_thread_area 211
488__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
489#define __NR_lookup_dcookie 212
490__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
491#define __NR_epoll_create 213
492__SYSCALL(__NR_epoll_create, sys_epoll_create)
493#define __NR_epoll_ctl_old 214
494__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
495#define __NR_epoll_wait_old 215
496__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
497#define __NR_remap_file_pages 216
498__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
499#define __NR_getdents64 217
500__SYSCALL(__NR_getdents64, sys_getdents64)
501#define __NR_set_tid_address 218
502__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
503#define __NR_restart_syscall 219
504__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
505#define __NR_semtimedop 220
506__SYSCALL(__NR_semtimedop, sys_semtimedop)
507#define __NR_fadvise64 221
508__SYSCALL(__NR_fadvise64, sys_fadvise64)
509#define __NR_timer_create 222
510__SYSCALL(__NR_timer_create, sys_timer_create)
511#define __NR_timer_settime 223
512__SYSCALL(__NR_timer_settime, sys_timer_settime)
513#define __NR_timer_gettime 224
514__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
515#define __NR_timer_getoverrun 225
516__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
517#define __NR_timer_delete 226
518__SYSCALL(__NR_timer_delete, sys_timer_delete)
519#define __NR_clock_settime 227
520__SYSCALL(__NR_clock_settime, sys_clock_settime)
521#define __NR_clock_gettime 228
522__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
523#define __NR_clock_getres 229
524__SYSCALL(__NR_clock_getres, sys_clock_getres)
525#define __NR_clock_nanosleep 230
526__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
527#define __NR_exit_group 231
528__SYSCALL(__NR_exit_group, sys_exit_group)
529#define __NR_epoll_wait 232
530__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
531#define __NR_epoll_ctl 233
532__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
533#define __NR_tgkill 234
534__SYSCALL(__NR_tgkill, sys_tgkill)
535#define __NR_utimes 235
536__SYSCALL(__NR_utimes, sys_utimes)
537#define __NR_vserver 236
538__SYSCALL(__NR_vserver, sys_ni_syscall)
539#define __NR_mbind 237
540__SYSCALL(__NR_mbind, sys_mbind)
541#define __NR_set_mempolicy 238
542__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
543#define __NR_get_mempolicy 239
544__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
545#define __NR_mq_open 240
546__SYSCALL(__NR_mq_open, sys_mq_open)
547#define __NR_mq_unlink 241
548__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
549#define __NR_mq_timedsend 242
550__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
551#define __NR_mq_timedreceive 243
552__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
553#define __NR_mq_notify 244
554__SYSCALL(__NR_mq_notify, sys_mq_notify)
555#define __NR_mq_getsetattr 245
556__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
557#define __NR_kexec_load 246
558__SYSCALL(__NR_kexec_load, sys_kexec_load)
559#define __NR_waitid 247
560__SYSCALL(__NR_waitid, sys_waitid)
561#define __NR_add_key 248
562__SYSCALL(__NR_add_key, sys_add_key)
563#define __NR_request_key 249
564__SYSCALL(__NR_request_key, sys_request_key)
565#define __NR_keyctl 250
566__SYSCALL(__NR_keyctl, sys_keyctl)
567#define __NR_ioprio_set 251
568__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
569#define __NR_ioprio_get 252
570__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
571#define __NR_inotify_init 253
572__SYSCALL(__NR_inotify_init, sys_inotify_init)
573#define __NR_inotify_add_watch 254
574__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
575#define __NR_inotify_rm_watch 255
576__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
577#define __NR_migrate_pages 256
578__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
579#define __NR_openat 257
580__SYSCALL(__NR_openat, sys_openat)
581#define __NR_mkdirat 258
582__SYSCALL(__NR_mkdirat, sys_mkdirat)
583#define __NR_mknodat 259
584__SYSCALL(__NR_mknodat, sys_mknodat)
585#define __NR_fchownat 260
586__SYSCALL(__NR_fchownat, sys_fchownat)
587#define __NR_futimesat 261
588__SYSCALL(__NR_futimesat, sys_futimesat)
589#define __NR_newfstatat 262
590__SYSCALL(__NR_newfstatat, sys_newfstatat)
591#define __NR_unlinkat 263
592__SYSCALL(__NR_unlinkat, sys_unlinkat)
593#define __NR_renameat 264
594__SYSCALL(__NR_renameat, sys_renameat)
595#define __NR_linkat 265
596__SYSCALL(__NR_linkat, sys_linkat)
597#define __NR_symlinkat 266
598__SYSCALL(__NR_symlinkat, sys_symlinkat)
599#define __NR_readlinkat 267
600__SYSCALL(__NR_readlinkat, sys_readlinkat)
601#define __NR_fchmodat 268
602__SYSCALL(__NR_fchmodat, sys_fchmodat)
603#define __NR_faccessat 269
604__SYSCALL(__NR_faccessat, sys_faccessat)
605#define __NR_pselect6 270
606__SYSCALL(__NR_pselect6, sys_pselect6)
607#define __NR_ppoll 271
608__SYSCALL(__NR_ppoll, sys_ppoll)
609#define __NR_unshare 272
610__SYSCALL(__NR_unshare, sys_unshare)
611#define __NR_set_robust_list 273
612__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
613#define __NR_get_robust_list 274
614__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
615#define __NR_splice 275
616__SYSCALL(__NR_splice, sys_splice)
617#define __NR_tee 276
618__SYSCALL(__NR_tee, sys_tee)
619#define __NR_sync_file_range 277
620__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
621#define __NR_vmsplice 278
622__SYSCALL(__NR_vmsplice, sys_vmsplice)
623#define __NR_move_pages 279
624__SYSCALL(__NR_move_pages, sys_move_pages)
625#define __NR_utimensat 280
626__SYSCALL(__NR_utimensat, sys_utimensat)
627#define __NR_epoll_pwait 281
628__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
629#define __NR_signalfd 282
630__SYSCALL(__NR_signalfd, sys_signalfd)
631#define __NR_timerfd_create 283
632__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
633#define __NR_eventfd 284
634__SYSCALL(__NR_eventfd, sys_eventfd)
635#define __NR_fallocate 285
636__SYSCALL(__NR_fallocate, sys_fallocate)
637#define __NR_timerfd_settime 286
638__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
639#define __NR_timerfd_gettime 287
640__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
641#define __NR_accept4 288
642__SYSCALL(__NR_accept4, sys_accept4)
643#define __NR_signalfd4 289
644__SYSCALL(__NR_signalfd4, sys_signalfd4)
645#define __NR_eventfd2 290
646__SYSCALL(__NR_eventfd2, sys_eventfd2)
647#define __NR_epoll_create1 291
648__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
649#define __NR_dup3 292
650__SYSCALL(__NR_dup3, sys_dup3)
651#define __NR_pipe2 293
652__SYSCALL(__NR_pipe2, sys_pipe2)
653#define __NR_inotify_init1 294
654__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
655#define __NR_preadv 295
656__SYSCALL(__NR_preadv, sys_preadv)
657#define __NR_pwritev 296
658__SYSCALL(__NR_pwritev, sys_pwritev)
659#define __NR_rt_tgsigqueueinfo 297
660__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
661#define __NR_perf_event_open 298
662__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
663#define __NR_recvmmsg 299
664__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
665#define __NR_fanotify_init 300
666__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
667#define __NR_fanotify_mark 301
668__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
669#define __NR_prlimit64 302
670__SYSCALL(__NR_prlimit64, sys_prlimit64)
671#define __NR_name_to_handle_at 303
672__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
673#define __NR_open_by_handle_at 304
674__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
675#define __NR_clock_adjtime 305
676__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
677#define __NR_syncfs 306
678__SYSCALL(__NR_syncfs, sys_syncfs)
679#define __NR_sendmmsg 307
680__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
681#define __NR_setns 308
682__SYSCALL(__NR_setns, sys_setns)
683#define __NR_getcpu 309
684__SYSCALL(__NR_getcpu, sys_getcpu)
685#define __NR_process_vm_readv 310
686__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
687#define __NR_process_vm_writev 311
688__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
689
690#ifndef __NO_STUBS
691#define __ARCH_WANT_OLD_READDIR
692#define __ARCH_WANT_OLD_STAT
693#define __ARCH_WANT_SYS_ALARM
694#define __ARCH_WANT_SYS_GETHOSTNAME
695#define __ARCH_WANT_SYS_PAUSE
696#define __ARCH_WANT_SYS_SGETMASK
697#define __ARCH_WANT_SYS_SIGNAL
698#define __ARCH_WANT_SYS_UTIME
699#define __ARCH_WANT_SYS_WAITPID
700#define __ARCH_WANT_SYS_SOCKETCALL
701#define __ARCH_WANT_SYS_FADVISE64
702#define __ARCH_WANT_SYS_GETPGRP
703#define __ARCH_WANT_SYS_LLSEEK
704#define __ARCH_WANT_SYS_NICE
705#define __ARCH_WANT_SYS_OLD_GETRLIMIT
706#define __ARCH_WANT_SYS_OLD_UNAME
707#define __ARCH_WANT_SYS_OLDUMOUNT
708#define __ARCH_WANT_SYS_SIGPENDING
709#define __ARCH_WANT_SYS_SIGPROCMASK
710#define __ARCH_WANT_SYS_RT_SIGACTION
711#define __ARCH_WANT_SYS_RT_SIGSUSPEND
712#define __ARCH_WANT_SYS_TIME
713#define __ARCH_WANT_COMPAT_SYS_TIME
714#endif /* __NO_STUBS */
715
716#ifdef __KERNEL__
717
718#ifndef COMPILE_OFFSETS
719#include <asm/asm-offsets.h>
720#define NR_syscalls (__NR_syscall_max + 1)
721#endif
722
723/*
724 * "Conditional" syscalls
725 *
726 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
727 * but it doesn't work on all toolchains, so we just do it by hand
728 */
729#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
730#endif /* __KERNEL__ */
731
732#endif /* _ASM_X86_UNISTD_64_H */
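The __SYSCALL(nr, entry) pairs above are inert until a consumer defines the macro before including the list; the kernel uses that trick both to build the 64-bit syscall dispatch table and, as in the asm-offsets changes later in this diff, to count entries. A minimal sketch of the table-building expansion follows; the header and table names are illustrative, not the kernel's actual files.

/*
 * Sketch only: expand a __SYSCALL() list into a dispatch table.
 * "example_syscall_list.h" is a hypothetical header holding lines like
 * the ones above; __NR_syscall_max comes from asm-offsets in the kernel.
 */
typedef long (*sys_call_ptr_t)(void);
extern long sys_ni_syscall(void);

#define __SYSCALL(nr, sym) extern long sym(void);
#include "example_syscall_list.h"	/* pass 1: declare every entry point */
#undef __SYSCALL

#define __SYSCALL(nr, sym) [nr] = sym,
const sys_call_ptr_t example_sys_call_table[__NR_syscall_max + 1] = {
	[0 ... __NR_syscall_max] = sys_ni_syscall,	/* default: not implemented */
#include "example_syscall_list.h"	/* pass 2: fill the assigned slots */
};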
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 8e862aaf0d90..becf47b81735 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -65,7 +65,7 @@
65 * UV2: Bit 19 selects between 65 * UV2: Bit 19 selects between
66 * (0): 10 microsecond timebase and 66 * (0): 10 microsecond timebase and
67 * (1): 80 microseconds 67 * (1): 80 microseconds
68 * we're using 655us, similar to UV1: 65 units of 10us 68 * we're using 560us, similar to UV1: 65 units of 10us
69 */ 69 */
70#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) 70#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
71#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) 71#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
@@ -167,6 +167,7 @@
167#define FLUSH_RETRY_TIMEOUT 2 167#define FLUSH_RETRY_TIMEOUT 2
168#define FLUSH_GIVEUP 3 168#define FLUSH_GIVEUP 3
169#define FLUSH_COMPLETE 4 169#define FLUSH_COMPLETE 4
170#define FLUSH_RETRY_BUSYBUG 5
170 171
171/* 172/*
172 * tuning the action when the numalink network is extremely delayed 173 * tuning the action when the numalink network is extremely delayed
@@ -235,10 +236,10 @@ struct bau_msg_payload {
235 236
236 237
237/* 238/*
238 * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) 239 * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
239 * see table 4.2.3.0.1 in broacast_assist spec. 240 * see table 4.2.3.0.1 in broacast_assist spec.
240 */ 241 */
241struct bau_msg_header { 242struct uv1_bau_msg_header {
242 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 243 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
243 /* bits 5:0 */ 244 /* bits 5:0 */
244 unsigned int base_dest_nasid:15; /* nasid of the first bit */ 245 unsigned int base_dest_nasid:15; /* nasid of the first bit */
@@ -318,19 +319,87 @@ struct bau_msg_header {
318}; 319};
319 320
320/* 321/*
322 * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
323 * see figure 9-2 of harp_sys.pdf
324 */
325struct uv2_bau_msg_header {
326 unsigned int base_dest_nasid:15; /* nasid of the first bit */
327 /* bits 14:0 */ /* in uvhub map */
328 unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */
329 /* bits 19:15 */
330 unsigned int rsvd_1:1; /* must be zero */
331 /* bit 20 */
332 /* Address bits 59:21 */
333 /* bits 25:2 of address (44:21) are payload */
334 /* these next 24 bits become bytes 12-14 of msg */
335 /* bits 28:21 land in byte 12 */
336 unsigned int replied_to:1; /* sent as 0 by the source to
337 byte 12 */
338 /* bit 21 */
339 unsigned int msg_type:3; /* software type of the
340 message */
341 /* bits 24:22 */
342 unsigned int canceled:1; /* message canceled, resource
343 is to be freed*/
344 /* bit 25 */
345 unsigned int payload_1:3; /* not currently used */
346 /* bits 28:26 */
347
348 /* bits 36:29 land in byte 13 */
349 unsigned int payload_2a:3; /* not currently used */
350 unsigned int payload_2b:5; /* not currently used */
351 /* bits 36:29 */
352
353 /* bits 44:37 land in byte 14 */
354 unsigned int payload_3:8; /* not currently used */
355 /* bits 44:37 */
356
357 unsigned int rsvd_2:7; /* reserved */
358 /* bits 51:45 */
359 unsigned int swack_flag:1; /* software acknowledge flag */
360 /* bit 52 */
361 unsigned int rsvd_3a:3; /* must be zero */
362 unsigned int rsvd_3b:8; /* must be zero */
363 unsigned int rsvd_3c:8; /* must be zero */
364 unsigned int rsvd_3d:3; /* must be zero */
365 /* bits 74:53 */
366 unsigned int fairness:3; /* usually zero */
367 /* bits 77:75 */
368
369 unsigned int sequence:16; /* message sequence number */
370 /* bits 93:78 Suppl_A */
371 unsigned int chaining:1; /* next descriptor is part of
372 this activation*/
373 /* bit 94 */
374 unsigned int multilevel:1; /* multi-level multicast
375 format */
376 /* bit 95 */
377 unsigned int rsvd_4:24; /* ordered / source node /
378 source subnode / aging
379 must be zero */
380 /* bits 119:96 */
381 unsigned int command:8; /* message type */
382 /* bits 127:120 */
383};
384
385/*
321 * The activation descriptor: 386 * The activation descriptor:
322 * The format of the message to send, plus all accompanying control 387 * The format of the message to send, plus all accompanying control
323 * Should be 64 bytes 388 * Should be 64 bytes
324 */ 389 */
325struct bau_desc { 390struct bau_desc {
326 struct pnmask distribution; 391 struct pnmask distribution;
327 /* 392 /*
328 * message template, consisting of header and payload: 393 * message template, consisting of header and payload:
329 */ 394 */
330 struct bau_msg_header header; 395 union bau_msg_header {
331 struct bau_msg_payload payload; 396 struct uv1_bau_msg_header uv1_hdr;
397 struct uv2_bau_msg_header uv2_hdr;
398 } header;
399
400 struct bau_msg_payload payload;
332}; 401};
333/* 402/* UV1:
334 * -payload-- ---------header------ 403 * -payload-- ---------header------
335 * bytes 0-11 bits 41-56 bits 58-81 404 * bytes 0-11 bits 41-56 bits 58-81
336 * A B (2) C (3) 405 * A B (2) C (3)
@@ -340,6 +409,16 @@ struct bau_desc {
340 * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) 409 * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
341 * ------------payload queue----------- 410 * ------------payload queue-----------
342 */ 411 */
412/* UV2:
413 * -payload-- ---------header------
414 * bytes 0-11 bits 70-78 bits 21-44
415 * A B (2) C (3)
416 *
417 * A/B/C are moved to:
418 * A C B
419 * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
420 * ------------payload queue-----------
421 */
343 422
344/* 423/*
345 * The payload queue on the destination side is an array of these. 424 * The payload queue on the destination side is an array of these.
@@ -385,7 +464,6 @@ struct bau_pq_entry {
385struct msg_desc { 464struct msg_desc {
386 struct bau_pq_entry *msg; 465 struct bau_pq_entry *msg;
387 int msg_slot; 466 int msg_slot;
388 int swack_slot;
389 struct bau_pq_entry *queue_first; 467 struct bau_pq_entry *queue_first;
390 struct bau_pq_entry *queue_last; 468 struct bau_pq_entry *queue_last;
391}; 469};
@@ -405,6 +483,7 @@ struct ptc_stats {
405 requests */ 483 requests */
406 unsigned long s_stimeout; /* source side timeouts */ 484 unsigned long s_stimeout; /* source side timeouts */
407 unsigned long s_dtimeout; /* destination side timeouts */ 485 unsigned long s_dtimeout; /* destination side timeouts */
486 unsigned long s_strongnacks; /* number of strong nack's */
408 unsigned long s_time; /* time spent in sending side */ 487 unsigned long s_time; /* time spent in sending side */
409 unsigned long s_retriesok; /* successful retries */ 488 unsigned long s_retriesok; /* successful retries */
410 unsigned long s_ntargcpu; /* total number of cpu's 489 unsigned long s_ntargcpu; /* total number of cpu's
@@ -439,6 +518,9 @@ struct ptc_stats {
439 unsigned long s_retry_messages; /* retry broadcasts */ 518 unsigned long s_retry_messages; /* retry broadcasts */
440 unsigned long s_bau_reenabled; /* for bau enable/disable */ 519 unsigned long s_bau_reenabled; /* for bau enable/disable */
441 unsigned long s_bau_disabled; /* for bau enable/disable */ 520 unsigned long s_bau_disabled; /* for bau enable/disable */
521 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
522 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
523 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
442 /* destination statistics */ 524 /* destination statistics */
443 unsigned long d_alltlb; /* times all tlb's on this 525 unsigned long d_alltlb; /* times all tlb's on this
444 cpu were flushed */ 526 cpu were flushed */
@@ -511,9 +593,12 @@ struct bau_control {
511 short osnode; 593 short osnode;
512 short uvhub_cpu; 594 short uvhub_cpu;
513 short uvhub; 595 short uvhub;
596 short uvhub_version;
514 short cpus_in_socket; 597 short cpus_in_socket;
515 short cpus_in_uvhub; 598 short cpus_in_uvhub;
516 short partition_base_pnode; 599 short partition_base_pnode;
600 short using_desc; /* an index, like uvhub_cpu */
601 unsigned int inuse_map;
517 unsigned short message_number; 602 unsigned short message_number;
518 unsigned short uvhub_quiesce; 603 unsigned short uvhub_quiesce;
519 short socket_acknowledge_count[DEST_Q_SIZE]; 604 short socket_acknowledge_count[DEST_Q_SIZE];
@@ -531,6 +616,7 @@ struct bau_control {
531 int cong_response_us; 616 int cong_response_us;
532 int cong_reps; 617 int cong_reps;
533 int cong_period; 618 int cong_period;
619 unsigned long clocks_per_100_usec;
534 cycles_t period_time; 620 cycles_t period_time;
535 long period_requests; 621 long period_requests;
536 struct hub_and_pnode *thp; 622 struct hub_and_pnode *thp;
@@ -591,6 +677,11 @@ static inline void write_mmr_sw_ack(unsigned long mr)
591 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); 677 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
592} 678}
593 679
680static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
681{
682 write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
683}
684
594static inline unsigned long read_mmr_sw_ack(void) 685static inline unsigned long read_mmr_sw_ack(void)
595{ 686{
596 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 687 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
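Because bau_desc.header is now a union, a sender has to pick the UV1 or UV2 layout explicitly; the new bau_control.uvhub_version field above is what drives that choice. Below is a hedged sketch of the selection pattern using only fields visible in this header; it is not the actual initialization done by the BAU driver.

/* Sketch, assumes <asm/uv/uv_bau.h>: pick the per-generation header layout. */
static void example_fill_bau_header(struct bau_desc *bd, int uvhub_version,
				    int base_nasid, int subnode)
{
	if (uvhub_version == 1) {
		struct uv1_bau_msg_header *h = &bd->header.uv1_hdr;

		h->base_dest_nasid = base_nasid;
		h->dest_subnodeid  = subnode;	/* 0x10, for the LB */
	} else {
		struct uv2_bau_msg_header *h = &bd->header.uv2_hdr;

		h->base_dest_nasid = base_nasid;
		h->dest_subnodeid  = subnode;	/* 0x10, for the LB */
	}
}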
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 10474fb1185d..cf1d73643f60 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -57,6 +57,7 @@
57 57
58#define UV1_HUB_PART_NUMBER 0x88a5 58#define UV1_HUB_PART_NUMBER 0x88a5
59#define UV2_HUB_PART_NUMBER 0x8eb8 59#define UV2_HUB_PART_NUMBER 0x8eb8
60#define UV2_HUB_PART_NUMBER_X 0x1111
60 61
61/* Compat: if this #define is present, UV headers support UV2 */ 62/* Compat: if this #define is present, UV headers support UV2 */
62#define UV2_HUB_IS_SUPPORTED 1 63#define UV2_HUB_IS_SUPPORTED 1
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1971e652d24b..517d4767ffdd 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -7,6 +7,7 @@
7struct mpc_bus; 7struct mpc_bus;
8struct mpc_cpu; 8struct mpc_cpu;
9struct mpc_table; 9struct mpc_table;
10struct cpuinfo_x86;
10 11
11/** 12/**
12 * struct x86_init_mpparse - platform specific mpparse ops 13 * struct x86_init_mpparse - platform specific mpparse ops
@@ -147,6 +148,7 @@ struct x86_init_ops {
147 */ 148 */
148struct x86_cpuinit_ops { 149struct x86_cpuinit_ops {
149 void (*setup_percpu_clockev)(void); 150 void (*setup_percpu_clockev)(void);
151 void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
150}; 152};
151 153
152/** 154/**
@@ -177,6 +179,7 @@ struct x86_msi_ops {
177 int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); 179 int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
178 void (*teardown_msi_irq)(unsigned int irq); 180 void (*teardown_msi_irq)(unsigned int irq);
179 void (*teardown_msi_irqs)(struct pci_dev *dev); 181 void (*teardown_msi_irqs)(struct pci_dev *dev);
182 void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
180}; 183};
181 184
182extern struct x86_init_ops x86_init; 185extern struct x86_init_ops x86_init;
@@ -186,5 +189,6 @@ extern struct x86_msi_ops x86_msi;
186 189
187extern void x86_init_noop(void); 190extern void x86_init_noop(void);
188extern void x86_init_uint_noop(unsigned int unused); 191extern void x86_init_uint_noop(unsigned int unused);
192extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node);
189 193
190#endif 194#endif
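The new x86_cpuinit_ops.fixup_cpu_id hook lets a platform reconcile the NUMA node with the configured core number; the default handler, x86_default_fixup_cpu_id(), only prints an error (see cpu/common.c further down), while the NumaChip driver added later in this diff installs a real override. A hedged sketch of the override pattern, with illustrative names:

/* Sketch: a platform driver replacing the default fixup_cpu_id handler.
 * example_* names are illustrative; apic_numachip.c does the same thing. */
static void example_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
	c->phys_proc_id = node;		/* trust the NUMA node id */
}

static int __init example_platform_setup(void)
{
	x86_cpuinit.fixup_cpu_id = example_fixup_cpu_id;
	return 0;
}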
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..5369059c07a9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
25obj-y += probe_roms.o 25obj-y += probe_roms.o
26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
28obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 28obj-y += syscall_$(BITS).o
29obj-$(CONFIG_X86_64) += vsyscall_64.o
29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 30obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
30obj-y += bootflag.o e820.o 31obj-y += bootflag.o e820.o
31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 32obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
80obj-$(CONFIG_AMD_NB) += amd_nb.o 81obj-$(CONFIG_AMD_NB) += amd_nb.o
81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 82obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 83obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
84obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 85
84obj-$(CONFIG_KVM_GUEST) += kvm.o 86obj-$(CONFIG_KVM_GUEST) += kvm.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 87obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d0822d..ce664f33ea8e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,6 +219,8 @@ static int __init
219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) 219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
220{ 220{
221 struct acpi_madt_local_x2apic *processor = NULL; 221 struct acpi_madt_local_x2apic *processor = NULL;
222 int apic_id;
223 u8 enabled;
222 224
223 processor = (struct acpi_madt_local_x2apic *)header; 225 processor = (struct acpi_madt_local_x2apic *)header;
224 226
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
227 229
228 acpi_table_print_madt_entry(header); 230 acpi_table_print_madt_entry(header);
229 231
232 apic_id = processor->local_apic_id;
233 enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
230#ifdef CONFIG_X86_X2APIC 234#ifdef CONFIG_X86_X2APIC
231 /* 235 /*
232 * We need to register disabled CPU as well to permit 236 * We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
235 * to not preallocating memory for all NR_CPUS 239 * to not preallocating memory for all NR_CPUS
236 * when we use CPU hotplug. 240 * when we use CPU hotplug.
237 */ 241 */
238 acpi_register_lapic(processor->local_apic_id, /* APIC ID */ 242 if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
239 processor->lapic_flags & ACPI_MADT_ENABLED); 243 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
244 else
245 acpi_register_lapic(apic_id, enabled);
240#else 246#else
241 printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); 247 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
242#endif 248#endif
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..be16854591cc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,20 +119,49 @@ bool __init early_is_amd_nb(u32 device)
119 return false; 119 return false;
120} 120}
121 121
122struct resource *amd_get_mmconfig_range(struct resource *res)
123{
124 u32 address;
125 u64 base, msr;
126 unsigned segn_busn_bits;
127
128 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
129 return NULL;
130
131 /* assume all cpus from fam10h have mmconfig */
132 if (boot_cpu_data.x86 < 0x10)
133 return NULL;
134
135 address = MSR_FAM10H_MMIO_CONF_BASE;
136 rdmsrl(address, msr);
137
138 /* mmconfig is not enabled */
139 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
140 return NULL;
141
142 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
143
144 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
145 FAM10H_MMIO_CONF_BUSRANGE_MASK;
146
147 res->flags = IORESOURCE_MEM;
148 res->start = base;
149 res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
150 return res;
151}
152
122int amd_get_subcaches(int cpu) 153int amd_get_subcaches(int cpu)
123{ 154{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 155 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask; 156 unsigned int mask;
126 int cuid = 0; 157 int cuid;
127 158
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 159 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0; 160 return 0;
130 161
131 pci_read_config_dword(link, 0x1d4, &mask); 162 pci_read_config_dword(link, 0x1d4, &mask);
132 163
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id; 164 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf; 165 return (mask >> (4 * cuid)) & 0xf;
137} 166}
138 167
@@ -141,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask)
141 static unsigned int reset, ban; 170 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); 171 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg; 172 unsigned int reg;
144 int cuid = 0; 173 int cuid;
145 174
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) 175 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL; 176 return -EINVAL;
@@ -159,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask)
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); 188 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 } 189 }
161 190
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id; 191 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid; 192 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26; 193 mask |= (0xf ^ (1 << cuid)) << 26;
167 194
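The resource size computed in amd_get_mmconfig_range() above follows from the MMCONFIG layout: each PCI bus needs 1 MiB of extended config space (256 device/function slots x 4 KiB), and the BUSRANGE field encodes log2 of the number of buses, hence 1ULL << (segn_busn_bits + 20) bytes. A tiny illustrative check:

/* Sketch: MMCONFIG window size for a given bus-range encoding.
 * busrange_bits == 8 means 256 buses -> 256 MiB. */
static unsigned long long example_mmconfig_size(unsigned int busrange_bits)
{
	return 1ULL << (busrange_bits + 20);	/* 1 MiB per bus */
}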
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 767fd04f2843..0ae0323b1f9c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12# APIC probe will depend on the listing order here 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 14obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o 15obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o 16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f98d84caf94c..2eec05b6d1b8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
146int x2apic_mode; 146int x2apic_mode;
147#ifdef CONFIG_X86_X2APIC 147#ifdef CONFIG_X86_X2APIC
148/* x2apic enabled before OS handover */ 148/* x2apic enabled before OS handover */
149static int x2apic_preenabled; 149int x2apic_preenabled;
150static int x2apic_disabled;
151static int nox2apic;
150static __init int setup_nox2apic(char *str) 152static __init int setup_nox2apic(char *str)
151{ 153{
152 if (x2apic_enabled()) { 154 if (x2apic_enabled()) {
153 pr_warning("Bios already enabled x2apic, " 155 int apicid = native_apic_msr_read(APIC_ID);
154 "can't enforce nox2apic"); 156
155 return 0; 157 if (apicid >= 255) {
156 } 158 pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
159 apicid);
160 return 0;
161 }
162
163 pr_warning("x2apic already enabled. will disable it\n");
164 } else
165 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
166
167 nox2apic = 1;
157 168
158 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
159 return 0; 169 return 0;
160} 170}
161early_param("nox2apic", setup_nox2apic); 171early_param("nox2apic", setup_nox2apic);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
250 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 260 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
251 if (!send_status) 261 if (!send_status)
252 break; 262 break;
263 inc_irq_stat(icr_read_retry_count);
253 udelay(100); 264 udelay(100);
254 } while (timeout++ < 1000); 265 } while (timeout++ < 1000);
255 266
@@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
876 * Besides, if we don't timer interrupts ignore the global 887 * Besides, if we don't timer interrupts ignore the global
877 * interrupt lock, which is the WrongThing (tm) to do. 888 * interrupt lock, which is the WrongThing (tm) to do.
878 */ 889 */
879 exit_idle();
880 irq_enter(); 890 irq_enter();
891 exit_idle();
881 local_apic_timer_interrupt(); 892 local_apic_timer_interrupt();
882 irq_exit(); 893 irq_exit();
883 894
@@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
1431} 1442}
1432 1443
1433#ifdef CONFIG_X86_X2APIC 1444#ifdef CONFIG_X86_X2APIC
1445/*
1446 * Need to disable xapic and x2apic at the same time and then enable xapic mode
1447 */
1448static inline void __disable_x2apic(u64 msr)
1449{
1450 wrmsrl(MSR_IA32_APICBASE,
1451 msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1452 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1453}
1454
1455static __init void disable_x2apic(void)
1456{
1457 u64 msr;
1458
1459 if (!cpu_has_x2apic)
1460 return;
1461
1462 rdmsrl(MSR_IA32_APICBASE, msr);
1463 if (msr & X2APIC_ENABLE) {
1464 u32 x2apic_id = read_apic_id();
1465
1466 if (x2apic_id >= 255)
1467 panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
1468
1469 pr_info("Disabling x2apic\n");
1470 __disable_x2apic(msr);
1471
1472 if (nox2apic) {
1473 clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
1474 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
1475 }
1476
1477 x2apic_disabled = 1;
1478 x2apic_mode = 0;
1479
1480 register_lapic_address(mp_lapic_addr);
1481 }
1482}
1483
1434void check_x2apic(void) 1484void check_x2apic(void)
1435{ 1485{
1436 if (x2apic_enabled()) { 1486 if (x2apic_enabled()) {
@@ -1441,15 +1491,20 @@ void check_x2apic(void)
1441 1491
1442void enable_x2apic(void) 1492void enable_x2apic(void)
1443{ 1493{
1444 int msr, msr2; 1494 u64 msr;
1495
1496 rdmsrl(MSR_IA32_APICBASE, msr);
1497 if (x2apic_disabled) {
1498 __disable_x2apic(msr);
1499 return;
1500 }
1445 1501
1446 if (!x2apic_mode) 1502 if (!x2apic_mode)
1447 return; 1503 return;
1448 1504
1449 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1450 if (!(msr & X2APIC_ENABLE)) { 1505 if (!(msr & X2APIC_ENABLE)) {
1451 printk_once(KERN_INFO "Enabling x2apic\n"); 1506 printk_once(KERN_INFO "Enabling x2apic\n");
1452 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); 1507 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
1453 } 1508 }
1454} 1509}
1455#endif /* CONFIG_X86_X2APIC */ 1510#endif /* CONFIG_X86_X2APIC */
@@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)
1486 ret = save_ioapic_entries(); 1541 ret = save_ioapic_entries();
1487 if (ret) { 1542 if (ret) {
1488 pr_info("Saving IO-APIC state failed: %d\n", ret); 1543 pr_info("Saving IO-APIC state failed: %d\n", ret);
1489 goto out; 1544 return;
1490 } 1545 }
1491 1546
1492 local_irq_save(flags); 1547 local_irq_save(flags);
1493 legacy_pic->mask_all(); 1548 legacy_pic->mask_all();
1494 mask_ioapic_entries(); 1549 mask_ioapic_entries();
1495 1550
1551 if (x2apic_preenabled && nox2apic)
1552 disable_x2apic();
1553
1496 if (dmar_table_init_ret) 1554 if (dmar_table_init_ret)
1497 ret = -1; 1555 ret = -1;
1498 else 1556 else
1499 ret = enable_IR(); 1557 ret = enable_IR();
1500 1558
1559 if (!x2apic_supported())
1560 goto skip_x2apic;
1561
1501 if (ret < 0) { 1562 if (ret < 0) {
1502 /* IR is required if there is APIC ID > 255 even when running 1563 /* IR is required if there is APIC ID > 255 even when running
1503 * under KVM 1564 * under KVM
1504 */ 1565 */
1505 if (max_physical_apicid > 255 || 1566 if (max_physical_apicid > 255 ||
1506 !hypervisor_x2apic_available()) 1567 !hypervisor_x2apic_available()) {
1507 goto nox2apic; 1568 if (x2apic_preenabled)
1569 disable_x2apic();
1570 goto skip_x2apic;
1571 }
1508 /* 1572 /*
1509 * without IR all CPUs can be addressed by IOAPIC/MSI 1573 * without IR all CPUs can be addressed by IOAPIC/MSI
1510 * only in physical mode 1574 * only in physical mode
@@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)
1512 x2apic_force_phys(); 1576 x2apic_force_phys();
1513 } 1577 }
1514 1578
1515 if (ret == IRQ_REMAP_XAPIC_MODE) 1579 if (ret == IRQ_REMAP_XAPIC_MODE) {
1516 goto nox2apic; 1580 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1581 goto skip_x2apic;
1582 }
1517 1583
1518 x2apic_enabled = 1; 1584 x2apic_enabled = 1;
1519 1585
@@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)
1523 pr_info("Enabled x2apic\n"); 1589 pr_info("Enabled x2apic\n");
1524 } 1590 }
1525 1591
1526nox2apic: 1592skip_x2apic:
1527 if (ret < 0) /* IR enabling failed */ 1593 if (ret < 0) /* IR enabling failed */
1528 restore_ioapic_entries(); 1594 restore_ioapic_entries();
1529 legacy_pic->restore_mask(); 1595 legacy_pic->restore_mask();
1530 local_irq_restore(flags); 1596 local_irq_restore(flags);
1531
1532out:
1533 if (x2apic_enabled || !x2apic_supported())
1534 return;
1535
1536 if (x2apic_preenabled)
1537 panic("x2apic: enabled by BIOS but kernel init failed.");
1538 else if (ret == IRQ_REMAP_XAPIC_MODE)
1539 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1540 else if (ret < 0)
1541 pr_info("x2apic not enabled, IRQ remapping init failed\n");
1542} 1597}
1543 1598
1544#ifdef CONFIG_X86_64 1599#ifdef CONFIG_X86_64
@@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1809{ 1864{
1810 u32 v; 1865 u32 v;
1811 1866
1812 exit_idle();
1813 irq_enter(); 1867 irq_enter();
1868 exit_idle();
1814 /* 1869 /*
1815 * Check if this really is a spurious interrupt and ACK it 1870 * Check if this really is a spurious interrupt and ACK it
1816 * if it is a vectored one. Just in case... 1871 * if it is a vectored one. Just in case...
@@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
1846 "Illegal register address", /* APIC Error Bit 7 */ 1901 "Illegal register address", /* APIC Error Bit 7 */
1847 }; 1902 };
1848 1903
1849 exit_idle();
1850 irq_enter(); 1904 irq_enter();
1905 exit_idle();
1851 /* First tickle the hardware, only then report what went on. -- REW */ 1906 /* First tickle the hardware, only then report what went on. -- REW */
1852 v0 = apic_read(APIC_ESR); 1907 v0 = apic_read(APIC_ESR);
1853 apic_write(APIC_ESR, 0); 1908 apic_write(APIC_ESR, 0);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f7a41e4cae47..8c3cdded6f2b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
63 * document number 292116). So here it goes... 63 * document number 292116). So here it goes...
64 */ 64 */
65static void flat_init_apic_ldr(void) 65void flat_init_apic_ldr(void)
66{ 66{
67 unsigned long val; 67 unsigned long val;
68 unsigned long num, id; 68 unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
171 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
172} 172}
173 173
174static int flat_probe(void)
175{
176 return 1;
177}
178
174static struct apic apic_flat = { 179static struct apic apic_flat = {
175 .name = "flat", 180 .name = "flat",
176 .probe = NULL, 181 .probe = flat_probe,
177 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 182 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
178 .apic_id_registered = flat_apic_id_registered, 183 .apic_id_registered = flat_apic_id_registered,
179 184
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..09d3d8c1cd99
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific APIC Code
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#include <linux/errno.h>
15#include <linux/threads.h>
16#include <linux/cpumask.h>
17#include <linux/string.h>
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/ctype.h>
21#include <linux/init.h>
22#include <linux/hardirq.h>
23#include <linux/delay.h>
24
25#include <asm/numachip/numachip_csr.h>
26#include <asm/smp.h>
27#include <asm/apic.h>
28#include <asm/ipi.h>
29#include <asm/apic_flat_64.h>
30
31static int numachip_system __read_mostly;
32
33static struct apic apic_numachip __read_mostly;
34
35static unsigned int get_apic_id(unsigned long x)
36{
37 unsigned long value;
38 unsigned int id;
39
40 rdmsrl(MSR_FAM10H_NODE_ID, value);
41 id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
42
43 return id;
44}
45
46static unsigned long set_apic_id(unsigned int id)
47{
48 unsigned long x;
49
50 x = ((id & 0xffU) << 24);
51 return x;
52}
53
54static unsigned int read_xapic_id(void)
55{
56 return get_apic_id(apic_read(APIC_ID));
57}
58
59static int numachip_apic_id_registered(void)
60{
61 return physid_isset(read_xapic_id(), phys_cpu_present_map);
62}
63
64static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
65{
66 return initial_apic_id >> index_msb;
67}
68
69static const struct cpumask *numachip_target_cpus(void)
70{
71 return cpu_online_mask;
72}
73
74static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
75{
76 cpumask_clear(retmask);
77 cpumask_set_cpu(cpu, retmask);
78}
79
80static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
81{
82 union numachip_csr_g3_ext_irq_gen int_gen;
83
84 int_gen.s._destination_apic_id = phys_apicid;
85 int_gen.s._vector = 0;
86 int_gen.s._msgtype = APIC_DM_INIT >> 8;
87 int_gen.s._index = 0;
88
89 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
90
91 int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
92 int_gen.s._vector = start_rip >> 12;
93
94 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
95
96 atomic_set(&init_deasserted, 1);
97 return 0;
98}
99
100static void numachip_send_IPI_one(int cpu, int vector)
101{
102 union numachip_csr_g3_ext_irq_gen int_gen;
103 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
104
105 int_gen.s._destination_apic_id = apicid;
106 int_gen.s._vector = vector;
107 int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
108 int_gen.s._index = 0;
109
110 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
111}
112
113static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
114{
115 unsigned int cpu;
116
117 for_each_cpu(cpu, mask)
118 numachip_send_IPI_one(cpu, vector);
119}
120
121static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
122 int vector)
123{
124 unsigned int this_cpu = smp_processor_id();
125 unsigned int cpu;
126
127 for_each_cpu(cpu, mask) {
128 if (cpu != this_cpu)
129 numachip_send_IPI_one(cpu, vector);
130 }
131}
132
133static void numachip_send_IPI_allbutself(int vector)
134{
135 unsigned int this_cpu = smp_processor_id();
136 unsigned int cpu;
137
138 for_each_online_cpu(cpu) {
139 if (cpu != this_cpu)
140 numachip_send_IPI_one(cpu, vector);
141 }
142}
143
144static void numachip_send_IPI_all(int vector)
145{
146 numachip_send_IPI_mask(cpu_online_mask, vector);
147}
148
149static void numachip_send_IPI_self(int vector)
150{
151 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
152}
153
154static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
155{
156 int cpu;
157
158 /*
159 * We're using fixed IRQ delivery, can only return one phys APIC ID.
160 * May as well be the first.
161 */
162 cpu = cpumask_first(cpumask);
163 if (likely((unsigned)cpu < nr_cpu_ids))
164 return per_cpu(x86_cpu_to_apicid, cpu);
165
166 return BAD_APICID;
167}
168
169static unsigned int
170numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
171 const struct cpumask *andmask)
172{
173 int cpu;
174
175 /*
176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
177 * May as well be the first.
178 */
179 for_each_cpu_and(cpu, cpumask, andmask) {
180 if (cpumask_test_cpu(cpu, cpu_online_mask))
181 break;
182 }
183 return per_cpu(x86_cpu_to_apicid, cpu);
184}
185
186static int __init numachip_probe(void)
187{
188 return apic == &apic_numachip;
189}
190
191static void __init map_csrs(void)
192{
193 printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
194 NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
195 init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
196
197 printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
198 NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
199 init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
200}
201
202static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
203{
204 c->phys_proc_id = node;
205 per_cpu(cpu_llc_id, smp_processor_id()) = node;
206}
207
208static int __init numachip_system_init(void)
209{
210 unsigned int val;
211
212 if (!numachip_system)
213 return 0;
214
215 x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
216
217 map_csrs();
218
219 val = read_lcsr(CSR_G0_NODE_IDS);
220 printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
221
222 return 0;
223}
224early_initcall(numachip_system_init);
225
226static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
227{
228 if (!strncmp(oem_id, "NUMASC", 6)) {
229 numachip_system = 1;
230 return 1;
231 }
232
233 return 0;
234}
235
236static struct apic apic_numachip __refconst = {
237
238 .name = "NumaConnect system",
239 .probe = numachip_probe,
240 .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
241 .apic_id_registered = numachip_apic_id_registered,
242
243 .irq_delivery_mode = dest_Fixed,
244 .irq_dest_mode = 0, /* physical */
245
246 .target_cpus = numachip_target_cpus,
247 .disable_esr = 0,
248 .dest_logical = 0,
249 .check_apicid_used = NULL,
250 .check_apicid_present = NULL,
251
252 .vector_allocation_domain = numachip_vector_allocation_domain,
253 .init_apic_ldr = flat_init_apic_ldr,
254
255 .ioapic_phys_id_map = NULL,
256 .setup_apic_routing = NULL,
257 .multi_timer_check = NULL,
258 .cpu_present_to_apicid = default_cpu_present_to_apicid,
259 .apicid_to_cpu_present = NULL,
260 .setup_portio_remap = NULL,
261 .check_phys_apicid_present = default_check_phys_apicid_present,
262 .enable_apic_mode = NULL,
263 .phys_pkg_id = numachip_phys_pkg_id,
264 .mps_oem_check = NULL,
265
266 .get_apic_id = get_apic_id,
267 .set_apic_id = set_apic_id,
268 .apic_id_mask = 0xffU << 24,
269
270 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
271 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
272
273 .send_IPI_mask = numachip_send_IPI_mask,
274 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
275 .send_IPI_allbutself = numachip_send_IPI_allbutself,
276 .send_IPI_all = numachip_send_IPI_all,
277 .send_IPI_self = numachip_send_IPI_self,
278
279 .wakeup_secondary_cpu = numachip_wakeup_secondary,
280 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
281 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
282 .wait_for_init_deassert = NULL,
283 .smp_callin_clear_local_apic = NULL,
284 .inquire_remote_apic = NULL, /* REMRD not supported */
285
286 .read = native_apic_mem_read,
287 .write = native_apic_mem_write,
288 .icr_read = native_apic_icr_read,
289 .icr_write = native_apic_icr_write,
290 .wait_icr_idle = native_apic_wait_icr_idle,
291 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
292};
293apic_driver(apic_numachip);
294
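In get_apic_id() above the low 8 bits still come from the xAPIC APIC_ID register, while bits 13:8 are filled from MSR_FAM10H_NODE_ID, so a NumaConnect fabric can address many more processors than the 8-bit xAPIC space allows without using x2apic. A hedged helper mirroring that bit composition (illustrative, not part of the driver):

/* Sketch: compose an extended APIC id from the 8-bit local id and the
 * node bits, mirroring get_apic_id()/set_apic_id() above. */
static unsigned int example_compose_apic_id(unsigned int local_id,
					    unsigned long node_msr)
{
	return (local_id & 0xffU) | ((unsigned int)(node_msr << 2) & 0x3f00U);
}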
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d939d7847e2..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2421 unsigned vector, me; 2421 unsigned vector, me;
2422 2422
2423 ack_APIC_irq(); 2423 ack_APIC_irq();
2424 exit_idle();
2425 irq_enter(); 2424 irq_enter();
2425 exit_idle();
2426 2426
2427 me = smp_processor_id(); 2427 me = smp_processor_id();
2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)
2948 } 2948 }
2949 local_irq_disable(); 2949 local_irq_disable();
2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2951 if (x2apic_preenabled)
2952 apic_printk(APIC_QUIET, KERN_INFO
2953 "Perhaps problem with the pre-enabled x2apic mode\n"
2954 "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
2951 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2955 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2952 "report. Then try booting with the 'noapic' option.\n"); 2956 "report. Then try booting with the 'noapic' option.\n");
2953out: 2957out:
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 62ae3001ae02..79b05b88aa19 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -93,6 +93,8 @@ static int __init early_get_pnodeid(void)
93 93
94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER) 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; 95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96 if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
97 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96 98
97 uv_hub_info->hub_revision = uv_min_hub_revision_id; 99 uv_hub_info->hub_revision = uv_min_hub_revision_id;
98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); 100 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -767,7 +769,12 @@ void __init uv_system_init(void)
767 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 769 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
768 uv_possible_blades += 770 uv_possible_blades +=
769 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); 771 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
770 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 772
773 /* uv_num_possible_blades() is really the hub count */
774 printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
775 is_uv1_hub() ? uv_num_possible_blades() :
776 (uv_num_possible_blades() + 1) / 2,
777 uv_num_possible_blades());
771 778
772 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 779 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
773 uv_blade_info = kzalloc(bytes, GFP_KERNEL); 780 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
383static int ignore_normal_resume; 383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 385
386static int debug __read_mostly; 386static bool debug __read_mostly;
387static int smp __read_mostly; 387static bool smp __read_mostly;
388static int apm_disabled = -1; 388static int apm_disabled = -1;
389#ifdef CONFIG_SMP 389#ifdef CONFIG_SMP
390static int power_off; 390static bool power_off;
391#else 391#else
392static int power_off = 1; 392static bool power_off = 1;
393#endif 393#endif
394static int realmode_power_off; 394static bool realmode_power_off;
395#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
396static int allow_ints = 1; 396static bool allow_ints = 1;
397#else 397#else
398static int allow_ints; 398static bool allow_ints;
399#endif 399#endif
400static int broken_psr; 400static bool broken_psr;
401 401
402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version); 68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
71 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
70} 72}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e68067..85d98ab15cdc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
3#include <linux/lguest.h> 3#include <linux/lguest.h>
4#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
5 5
6#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
7static char syscalls[] = {
8#include <asm/syscalls_32.h>
9};
10
6/* workaround for a warning with -Wmissing-prototypes */ 11/* workaround for a warning with -Wmissing-prototypes */
7void foo(void); 12void foo(void);
8 13
@@ -76,4 +81,7 @@ void foo(void)
76 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 81 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
77 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 82 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
78#endif 83#endif
84 BLANK();
85 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
86 DEFINE(NR_syscalls, sizeof(syscalls));
79} 87}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af22..834e897b1e25 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
1#include <asm/ia32.h> 1#include <asm/ia32.h>
2 2
3#define __NO_STUBS 1 3#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
4#undef __SYSCALL 4static char syscalls_64[] = {
5#undef _ASM_X86_UNISTD_64_H 5#include <asm/syscalls_64.h>
6#define __SYSCALL(nr, sym) [nr] = 1, 6};
7static char syscalls[] = { 7#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
8#include <asm/unistd.h> 8static char syscalls_ia32[] = {
9#include <asm/syscalls_32.h>
9}; 10};
10 11
11int main(void) 12int main(void)
@@ -72,7 +73,11 @@ int main(void)
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 73 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
73 BLANK(); 74 BLANK();
74 75
75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); 76 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
77 DEFINE(NR_syscalls, sizeof(syscalls_64));
78
79 DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
80 DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
76 81
77 return 0; 82 return 0;
78} 83}
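The asm-offsets changes above rely on a counting idiom: each __SYSCALL_64(nr, sym, compat) entry expands to "[nr] = 1,", so the char array is exactly one byte longer than the highest syscall number and sizeof() yields the slot count, which DEFINE() then exports as an assembler-visible constant. A standalone sketch of the same idiom:

/* Sketch: designated initializers size the array to the largest index + 1,
 * so sizeof() gives the slot count without a hand-maintained maximum. */
#include <stdio.h>

#define ENTRY(nr) [nr] = 1,
static char example_table[] = {
	ENTRY(0) ENTRY(3) ENTRY(15)	/* sparse entries are fine */
};

int main(void)
{
	printf("max index = %zu, slots = %zu\n",
	       sizeof(example_table) - 1, sizeof(example_table));
	return 0;
}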
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c7e46cb35327..f4773f4aae35 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
148 148
149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) 149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
150{ 150{
151#ifdef CONFIG_SMP
152 /* calling is from identify_secondary_cpu() ? */ 151 /* calling is from identify_secondary_cpu() ? */
153 if (!c->cpu_index) 152 if (!c->cpu_index)
154 return; 153 return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
192 191
193valid_k7: 192valid_k7:
194 ; 193 ;
195#endif
196} 194}
197 195
198static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 196static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
353 if (node == NUMA_NO_NODE) 351 if (node == NUMA_NO_NODE)
354 node = per_cpu(cpu_llc_id, cpu); 352 node = per_cpu(cpu_llc_id, cpu);
355 353
354 /*
355 * If core numbers are inconsistent, it's likely a multi-fabric platform,
356 * so invoke platform-specific handler
357 */
358 if (c->phys_proc_id != node)
359 x86_cpuinit.fixup_cpu_id(c, node);
360
356 if (!node_online(node)) { 361 if (!node_online(node)) {
357 /* 362 /*
358 * Two possibilities here: 363 * Two possibilities here:
@@ -442,8 +447,6 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
442 447
443static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 448static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
444{ 449{
445 u32 dummy;
446
447 early_init_amd_mc(c); 450 early_init_amd_mc(c);
448 451
449 /* 452 /*
@@ -473,12 +476,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
473 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 476 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
474 } 477 }
475#endif 478#endif
476
477 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
478} 479}
479 480
480static void __cpuinit init_amd(struct cpuinfo_x86 *c) 481static void __cpuinit init_amd(struct cpuinfo_x86 *c)
481{ 482{
483 u32 dummy;
484
482#ifdef CONFIG_SMP 485#ifdef CONFIG_SMP
483 unsigned long long value; 486 unsigned long long value;
484 487
@@ -657,6 +660,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
657 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 660 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
658 } 661 }
659 } 662 }
663
664 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
660} 665}
661 666
662#ifdef CONFIG_X86_32 667#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..159103c0b1f4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
278 } 278 }
279#ifdef CONFIG_X86_32 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 13) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
283 lo |= (1<<1 | 1<<7); 283 lo |= (1<<1 | 1<<7);
284 wrmsr(MSR_VIA_FCR, lo, hi); 284 wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..d43cad74f166 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
676 if (this_cpu->c_early_init) 676 if (this_cpu->c_early_init)
677 this_cpu->c_early_init(c); 677 this_cpu->c_early_init(c);
678 678
679#ifdef CONFIG_SMP
680 c->cpu_index = 0; 679 c->cpu_index = 0;
681#endif
682 filter_cpuid_features(c, false); 680 filter_cpuid_features(c, false);
683 681
684 setup_smep(c); 682 setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
764 c->apicid = c->initial_apicid; 762 c->apicid = c->initial_apicid;
765# endif 763# endif
766#endif 764#endif
767
768#ifdef CONFIG_X86_HT
769 c->phys_proc_id = c->initial_apicid; 765 c->phys_proc_id = c->initial_apicid;
770#endif
771 } 766 }
772 767
773 setup_smep(c); 768 setup_smep(c);
@@ -1026,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
1026 1021
1027#ifdef CONFIG_X86_64 1022#ifdef CONFIG_X86_64
1028struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; 1023struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
1024struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
1025 (unsigned long) nmi_idt_table };
1029 1026
1030DEFINE_PER_CPU_FIRST(union irq_stack_union, 1027DEFINE_PER_CPU_FIRST(union irq_stack_union,
1031 irq_stack_union) __aligned(PAGE_SIZE); 1028 irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1087,26 @@ unsigned long kernel_eflags;
1090 */ 1087 */
1091DEFINE_PER_CPU(struct orig_ist, orig_ist); 1088DEFINE_PER_CPU(struct orig_ist, orig_ist);
1092 1089
1090static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
1091DEFINE_PER_CPU(int, debug_stack_usage);
1092
1093int is_debug_stack(unsigned long addr)
1094{
1095 return __get_cpu_var(debug_stack_usage) ||
1096 (addr <= __get_cpu_var(debug_stack_addr) &&
1097 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
1098}
1099
1100void debug_stack_set_zero(void)
1101{
1102 load_idt((const struct desc_ptr *)&nmi_idt_descr);
1103}
1104
1105void debug_stack_reset(void)
1106{
1107 load_idt((const struct desc_ptr *)&idt_descr);
1108}
1109
1093#else /* CONFIG_X86_64 */ 1110#else /* CONFIG_X86_64 */
1094 1111
1095DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1112DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1141,6 +1158,15 @@ static void dbg_restore_debug_regs(void)
1141#endif /* ! CONFIG_KGDB */ 1158#endif /* ! CONFIG_KGDB */
1142 1159
1143/* 1160/*
1161 * Prints an error where the NUMA and configured core-number mismatch and the
1162 * platform didn't override this to fix it up
1163 */
1164void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
1165{
1166 pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
1167}
1168
1169/*
1144 * cpu_init() initializes state that is per-CPU. Some data is already 1170 * cpu_init() initializes state that is per-CPU. Some data is already
1145 * initialized (naturally) in the bootstrap process, such as the GDT 1171 * initialized (naturally) in the bootstrap process, such as the GDT
1146 * and IDT. We reload them nevertheless, this function acts as a 1172 * and IDT. We reload them nevertheless, this function acts as a
@@ -1208,6 +1234,8 @@ void __cpuinit cpu_init(void)
1208 estacks += exception_stack_sizes[v]; 1234 estacks += exception_stack_sizes[v];
1209 oist->ist[v] = t->x86_tss.ist[v] = 1235 oist->ist[v] = t->x86_tss.ist[v] =
1210 (unsigned long)estacks; 1236 (unsigned long)estacks;
1237 if (v == DEBUG_STACK-1)
1238 per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1211 } 1239 }
1212 } 1240 }
1213 1241
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1b22dcc51af4..8bacc7826fb3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,5 +1,4 @@
1#ifndef ARCH_X86_CPU_H 1#ifndef ARCH_X86_CPU_H
2
3#define ARCH_X86_CPU_H 2#define ARCH_X86_CPU_H
4 3
5struct cpu_model_info { 4struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
35 34
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 35extern void get_cpu_cap(struct cpuinfo_x86 *c);
37extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
38extern void get_cpu_cap(struct cpuinfo_x86 *c); 37#endif /* ARCH_X86_CPU_H */
39
40#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 523131213f08..3e6ff6cbf42a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
181 181
182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) 182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
183{ 183{
184#ifdef CONFIG_SMP
185 /* calling is from identify_secondary_cpu() ? */ 184 /* calling is from identify_secondary_cpu() ? */
186 if (!c->cpu_index) 185 if (!c->cpu_index)
187 return; 186 return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
198 WARN_ONCE(1, "WARNING: SMP operation may be unreliable" 197 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
199 "with B stepping processors.\n"); 198 "with B stepping processors.\n");
200 } 199 }
201#endif
202} 200}
203 201
204static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 202static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
844 844
845#include <linux/kobject.h> 845#include <linux/kobject.h>
846#include <linux/sysfs.h> 846#include <linux/sysfs.h>
847 847#include <linux/cpu.h>
848extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
849 848
850/* pointer to kobject for cpuX/cache */ 849/* pointer to kobject for cpuX/cache */
851static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); 850static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
1073static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 1072static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
1074 1073
1075/* Add/Remove cache interface for CPU device */ 1074/* Add/Remove cache interface for CPU device */
1076static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 1075static int __cpuinit cache_add_dev(struct device *dev)
1077{ 1076{
1078 unsigned int cpu = sys_dev->id; 1077 unsigned int cpu = dev->id;
1079 unsigned long i, j; 1078 unsigned long i, j;
1080 struct _index_kobject *this_object; 1079 struct _index_kobject *this_object;
1081 struct _cpuid4_info *this_leaf; 1080 struct _cpuid4_info *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1087 1086
1088 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), 1087 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1089 &ktype_percpu_entry, 1088 &ktype_percpu_entry,
1090 &sys_dev->kobj, "%s", "cache"); 1089 &dev->kobj, "%s", "cache");
1091 if (retval < 0) { 1090 if (retval < 0) {
1092 cpuid4_cache_sysfs_exit(cpu); 1091 cpuid4_cache_sysfs_exit(cpu);
1093 return retval; 1092 return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1124 return 0; 1123 return 0;
1125} 1124}
1126 1125
1127static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 1126static void __cpuinit cache_remove_dev(struct device *dev)
1128{ 1127{
1129 unsigned int cpu = sys_dev->id; 1128 unsigned int cpu = dev->id;
1130 unsigned long i; 1129 unsigned long i;
1131 1130
1132 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 1131 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
1145 unsigned long action, void *hcpu) 1144 unsigned long action, void *hcpu)
1146{ 1145{
1147 unsigned int cpu = (unsigned long)hcpu; 1146 unsigned int cpu = (unsigned long)hcpu;
1148 struct sys_device *sys_dev; 1147 struct device *dev;
1149 1148
1150 sys_dev = get_cpu_sysdev(cpu); 1149 dev = get_cpu_device(cpu);
1151 switch (action) { 1150 switch (action) {
1152 case CPU_ONLINE: 1151 case CPU_ONLINE:
1153 case CPU_ONLINE_FROZEN: 1152 case CPU_ONLINE_FROZEN:
1154 cache_add_dev(sys_dev); 1153 cache_add_dev(dev);
1155 break; 1154 break;
1156 case CPU_DEAD: 1155 case CPU_DEAD:
1157 case CPU_DEAD_FROZEN: 1156 case CPU_DEAD_FROZEN:
1158 cache_remove_dev(sys_dev); 1157 cache_remove_dev(dev);
1159 break; 1158 break;
1160 } 1159 }
1161 return NOTIFY_OK; 1160 return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
1174 1173
1175 for_each_online_cpu(i) { 1174 for_each_online_cpu(i) {
1176 int err; 1175 int err;
1177 struct sys_device *sys_dev = get_cpu_sysdev(i); 1176 struct device *dev = get_cpu_device(i);
1178 1177
1179 err = cache_add_dev(sys_dev); 1178 err = cache_add_dev(dev);
1180 if (err) 1179 if (err)
1181 return err; 1180 return err;
1182 } 1181 }
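The conversion above is the pattern repeated throughout this merge: per-CPU sysfs code drops struct sys_device and looks the CPU's struct device up with get_cpu_device() instead of get_cpu_sysdev(). A minimal sketch of the converted hotplug-callback shape, with my_add_dev()/my_remove_dev() standing in for whatever the caller actually registers (both are hypothetical helpers):

#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/notifier.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct device *dev = get_cpu_device(cpu);	/* was get_cpu_sysdev() */

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		my_add_dev(dev);		/* hypothetical helper */
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		my_remove_dev(dev);		/* hypothetical helper */
		break;
	}
	return NOTIFY_OK;
}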
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 319882ef848d..fc4beb393577 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -17,6 +17,7 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/preempt.h>
20#include <linux/smp.h> 21#include <linux/smp.h>
21#include <linux/notifier.h> 22#include <linux/notifier.h>
22#include <linux/kdebug.h> 23#include <linux/kdebug.h>
@@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
92 return NMI_HANDLED; 93 return NMI_HANDLED;
93} 94}
94 95
96static void mce_irq_ipi(void *info)
97{
98 int cpu = smp_processor_id();
99 struct mce *m = &__get_cpu_var(injectm);
100
101 if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
102 m->inject_flags & MCJ_EXCEPTION) {
103 cpumask_clear_cpu(cpu, mce_inject_cpumask);
104 raise_exception(m, NULL);
105 }
106}
107
95/* Inject mce on current CPU */ 108/* Inject mce on current CPU */
96static int raise_local(void) 109static int raise_local(void)
97{ 110{
@@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)
139 return; 152 return;
140 153
141#ifdef CONFIG_X86_LOCAL_APIC 154#ifdef CONFIG_X86_LOCAL_APIC
142 if (m->inject_flags & MCJ_NMI_BROADCAST) { 155 if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
143 unsigned long start; 156 unsigned long start;
144 int cpu; 157 int cpu;
158
145 get_online_cpus(); 159 get_online_cpus();
146 cpumask_copy(mce_inject_cpumask, cpu_online_mask); 160 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
147 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); 161 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)
151 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 165 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
152 cpumask_clear_cpu(cpu, mce_inject_cpumask); 166 cpumask_clear_cpu(cpu, mce_inject_cpumask);
153 } 167 }
154 if (!cpumask_empty(mce_inject_cpumask)) 168 if (!cpumask_empty(mce_inject_cpumask)) {
155 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); 169 if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
170 /*
 171 * don't wait here: mce_irq_ipi() has to run on the other
 172 * CPUs concurrently with the raise_local() call that follows
173 */
174 preempt_disable();
175 smp_call_function_many(mce_inject_cpumask,
176 mce_irq_ipi, NULL, 0);
177 preempt_enable();
178 } else if (m->inject_flags & MCJ_NMI_BROADCAST)
179 apic->send_IPI_mask(mce_inject_cpumask,
180 NMI_VECTOR);
181 }
156 start = jiffies; 182 start = jiffies;
157 while (!cpumask_empty(mce_inject_cpumask)) { 183 while (!cpumask_empty(mce_inject_cpumask)) {
158 if (!time_before(jiffies, start + 2*HZ)) { 184 if (!time_before(jiffies, start + 2*HZ)) {
159 printk(KERN_ERR 185 printk(KERN_ERR
160 "Timeout waiting for mce inject NMI %lx\n", 186 "Timeout waiting for mce inject %lx\n",
161 *cpumask_bits(mce_inject_cpumask)); 187 *cpumask_bits(mce_inject_cpumask));
162 break; 188 break;
163 } 189 }
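The new IRQ-broadcast branch above relies on a fire-and-forget cross-CPU call: smp_call_function_many() with wait=0 under preempt_disable(), so the IPI handlers run concurrently with the raise_local() call on the injecting CPU. A stand-alone sketch of that generic pattern (the mask and handler names are placeholders, not from this patch):

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/preempt.h>

static void my_ipi_handler(void *info)
{
	/* runs on every CPU in the mask, in interrupt context */
}

static void kick_other_cpus(const struct cpumask *mask)
{
	/*
	 * smp_call_function_many() must not be called with preemption
	 * enabled; wait=0 means we do not block until the handlers finish.
	 */
	preempt_disable();
	smp_call_function_many(mask, my_ipi_handler, NULL, 0);
	preempt_enable();
}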
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
1#include <linux/sysdev.h> 1#include <linux/device.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4enum severity_level { 4enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
17struct mce_bank { 17struct mce_bank {
18 u64 ctl; /* subevents to enable */ 18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */ 19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */ 20 struct device_attribute attr; /* device attribute */
21 char attrname[ATTR_LEN]; /* attribute name */ 21 char attrname[ATTR_LEN]; /* attribute name */
22}; 22};
23 23
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2af127d4c3d1..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/sysdev.h> 22#include <linux/device.h>
23#include <linux/syscore_ops.h> 23#include <linux/syscore_ops.h>
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/ctype.h> 25#include <linux/ctype.h>
@@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
95static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
96static int cpu_missing; 96static int cpu_missing;
97 97
98/*
99 * CPU/chipset specific EDAC code can register a notifier call here to print
100 * MCE errors in a human-readable form.
101 */
102ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
103EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
104
105/* MCA banks polled by the period polling timer for corrected events */ 98/* MCA banks polled by the period polling timer for corrected events */
106DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 99DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
107 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 100 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
109 102
110static DEFINE_PER_CPU(struct work_struct, mce_work); 103static DEFINE_PER_CPU(struct work_struct, mce_work);
111 104
105/*
106 * CPU/chipset specific EDAC code can register a notifier call here to print
107 * MCE errors in a human-readable form.
108 */
109ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
110
112/* Do initial initialization of a struct mce */ 111/* Do initial initialization of a struct mce */
113void mce_setup(struct mce *m) 112void mce_setup(struct mce *m)
114{ 113{
@@ -119,9 +118,7 @@ void mce_setup(struct mce *m)
119 m->time = get_seconds(); 118 m->time = get_seconds();
120 m->cpuvendor = boot_cpu_data.x86_vendor; 119 m->cpuvendor = boot_cpu_data.x86_vendor;
121 m->cpuid = cpuid_eax(1); 120 m->cpuid = cpuid_eax(1);
122#ifdef CONFIG_SMP
123 m->socketid = cpu_data(m->extcpu).phys_proc_id; 121 m->socketid = cpu_data(m->extcpu).phys_proc_id;
124#endif
125 m->apicid = cpu_data(m->extcpu).initial_apicid; 122 m->apicid = cpu_data(m->extcpu).initial_apicid;
126 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 123 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
127} 124}
@@ -190,6 +187,57 @@ void mce_log(struct mce *mce)
190 set_bit(0, &mce_need_notify); 187 set_bit(0, &mce_need_notify);
191} 188}
192 189
190static void drain_mcelog_buffer(void)
191{
192 unsigned int next, i, prev = 0;
193
194 next = rcu_dereference_check_mce(mcelog.next);
195
196 do {
197 struct mce *m;
198
199 /* drain what was logged during boot */
200 for (i = prev; i < next; i++) {
201 unsigned long start = jiffies;
202 unsigned retries = 1;
203
204 m = &mcelog.entry[i];
205
206 while (!m->finished) {
207 if (time_after_eq(jiffies, start + 2*retries))
208 retries++;
209
210 cpu_relax();
211
212 if (!m->finished && retries >= 4) {
213 pr_err("MCE: skipping error being logged currently!\n");
214 break;
215 }
216 }
217 smp_rmb();
218 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
219 }
220
221 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
222 prev = next;
223 next = cmpxchg(&mcelog.next, prev, 0);
224 } while (next != prev);
225}
226
227
228void mce_register_decode_chain(struct notifier_block *nb)
229{
230 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
231 drain_mcelog_buffer();
232}
233EXPORT_SYMBOL_GPL(mce_register_decode_chain);
234
235void mce_unregister_decode_chain(struct notifier_block *nb)
236{
237 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
238}
239EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
240
193static void print_mce(struct mce *m) 241static void print_mce(struct mce *m)
194{ 242{
195 int ret = 0; 243 int ret = 0;
@@ -1770,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
1770}; 1818};
1771 1819
1772/* 1820/*
1773 * mce_sysdev: Sysfs support 1821 * mce_device: Sysfs support
1774 */ 1822 */
1775 1823
1776static void mce_cpu_restart(void *data) 1824static void mce_cpu_restart(void *data)
@@ -1806,27 +1854,28 @@ static void mce_enable_ce(void *all)
1806 __mcheck_cpu_init_timer(); 1854 __mcheck_cpu_init_timer();
1807} 1855}
1808 1856
1809static struct sysdev_class mce_sysdev_class = { 1857static struct bus_type mce_subsys = {
1810 .name = "machinecheck", 1858 .name = "machinecheck",
1859 .dev_name = "machinecheck",
1811}; 1860};
1812 1861
1813DEFINE_PER_CPU(struct sys_device, mce_sysdev); 1862struct device *mce_device[CONFIG_NR_CPUS];
1814 1863
1815__cpuinitdata 1864__cpuinitdata
1816void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1865void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1817 1866
1818static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) 1867static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
1819{ 1868{
1820 return container_of(attr, struct mce_bank, attr); 1869 return container_of(attr, struct mce_bank, attr);
1821} 1870}
1822 1871
1823static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1872static ssize_t show_bank(struct device *s, struct device_attribute *attr,
1824 char *buf) 1873 char *buf)
1825{ 1874{
1826 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1875 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1827} 1876}
1828 1877
1829static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1878static ssize_t set_bank(struct device *s, struct device_attribute *attr,
1830 const char *buf, size_t size) 1879 const char *buf, size_t size)
1831{ 1880{
1832 u64 new; 1881 u64 new;
@@ -1841,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1841} 1890}
1842 1891
1843static ssize_t 1892static ssize_t
1844show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1893show_trigger(struct device *s, struct device_attribute *attr, char *buf)
1845{ 1894{
1846 strcpy(buf, mce_helper); 1895 strcpy(buf, mce_helper);
1847 strcat(buf, "\n"); 1896 strcat(buf, "\n");
1848 return strlen(mce_helper) + 1; 1897 return strlen(mce_helper) + 1;
1849} 1898}
1850 1899
1851static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1900static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
1852 const char *buf, size_t siz) 1901 const char *buf, size_t siz)
1853{ 1902{
1854 char *p; 1903 char *p;
@@ -1863,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1863 return strlen(mce_helper) + !!p; 1912 return strlen(mce_helper) + !!p;
1864} 1913}
1865 1914
1866static ssize_t set_ignore_ce(struct sys_device *s, 1915static ssize_t set_ignore_ce(struct device *s,
1867 struct sysdev_attribute *attr, 1916 struct device_attribute *attr,
1868 const char *buf, size_t size) 1917 const char *buf, size_t size)
1869{ 1918{
1870 u64 new; 1919 u64 new;
@@ -1887,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
1887 return size; 1936 return size;
1888} 1937}
1889 1938
1890static ssize_t set_cmci_disabled(struct sys_device *s, 1939static ssize_t set_cmci_disabled(struct device *s,
1891 struct sysdev_attribute *attr, 1940 struct device_attribute *attr,
1892 const char *buf, size_t size) 1941 const char *buf, size_t size)
1893{ 1942{
1894 u64 new; 1943 u64 new;
@@ -1910,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
1910 return size; 1959 return size;
1911} 1960}
1912 1961
1913static ssize_t store_int_with_restart(struct sys_device *s, 1962static ssize_t store_int_with_restart(struct device *s,
1914 struct sysdev_attribute *attr, 1963 struct device_attribute *attr,
1915 const char *buf, size_t size) 1964 const char *buf, size_t size)
1916{ 1965{
1917 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1966 ssize_t ret = device_store_int(s, attr, buf, size);
1918 mce_restart(); 1967 mce_restart();
1919 return ret; 1968 return ret;
1920} 1969}
1921 1970
1922static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1971static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
1923static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1972static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
1924static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1973static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1925static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1974static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1926 1975
1927static struct sysdev_ext_attribute attr_check_interval = { 1976static struct dev_ext_attribute dev_attr_check_interval = {
1928 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1977 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
1929 store_int_with_restart),
1930 &check_interval 1978 &check_interval
1931}; 1979};
1932 1980
1933static struct sysdev_ext_attribute attr_ignore_ce = { 1981static struct dev_ext_attribute dev_attr_ignore_ce = {
1934 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1982 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
1935 &mce_ignore_ce 1983 &mce_ignore_ce
1936}; 1984};
1937 1985
1938static struct sysdev_ext_attribute attr_cmci_disabled = { 1986static struct dev_ext_attribute dev_attr_cmci_disabled = {
1939 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), 1987 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
1940 &mce_cmci_disabled 1988 &mce_cmci_disabled
1941}; 1989};
1942 1990
1943static struct sysdev_attribute *mce_sysdev_attrs[] = { 1991static struct device_attribute *mce_device_attrs[] = {
1944 &attr_tolerant.attr, 1992 &dev_attr_tolerant.attr,
1945 &attr_check_interval.attr, 1993 &dev_attr_check_interval.attr,
1946 &attr_trigger, 1994 &dev_attr_trigger,
1947 &attr_monarch_timeout.attr, 1995 &dev_attr_monarch_timeout.attr,
1948 &attr_dont_log_ce.attr, 1996 &dev_attr_dont_log_ce.attr,
1949 &attr_ignore_ce.attr, 1997 &dev_attr_ignore_ce.attr,
1950 &attr_cmci_disabled.attr, 1998 &dev_attr_cmci_disabled.attr,
1951 NULL 1999 NULL
1952}; 2000};
1953 2001
1954static cpumask_var_t mce_sysdev_initialized; 2002static cpumask_var_t mce_device_initialized;
2003
2004static void mce_device_release(struct device *dev)
2005{
2006 kfree(dev);
2007}
1955 2008
1956/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 2009/* Per cpu device init. All of the cpus still share the same ctrl bank: */
1957static __cpuinit int mce_sysdev_create(unsigned int cpu) 2010static __cpuinit int mce_device_create(unsigned int cpu)
1958{ 2011{
1959 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2012 struct device *dev;
1960 int err; 2013 int err;
1961 int i, j; 2014 int i, j;
1962 2015
1963 if (!mce_available(&boot_cpu_data)) 2016 if (!mce_available(&boot_cpu_data))
1964 return -EIO; 2017 return -EIO;
1965 2018
1966 memset(&sysdev->kobj, 0, sizeof(struct kobject)); 2019 dev = kzalloc(sizeof *dev, GFP_KERNEL);
1967 sysdev->id = cpu; 2020 if (!dev)
1968 sysdev->cls = &mce_sysdev_class; 2021 return -ENOMEM;
2022 dev->id = cpu;
2023 dev->bus = &mce_subsys;
2024 dev->release = &mce_device_release;
1969 2025
1970 err = sysdev_register(sysdev); 2026 err = device_register(dev);
1971 if (err) 2027 if (err)
1972 return err; 2028 return err;
1973 2029
1974 for (i = 0; mce_sysdev_attrs[i]; i++) { 2030 for (i = 0; mce_device_attrs[i]; i++) {
1975 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); 2031 err = device_create_file(dev, mce_device_attrs[i]);
1976 if (err) 2032 if (err)
1977 goto error; 2033 goto error;
1978 } 2034 }
1979 for (j = 0; j < banks; j++) { 2035 for (j = 0; j < banks; j++) {
1980 err = sysdev_create_file(sysdev, &mce_banks[j].attr); 2036 err = device_create_file(dev, &mce_banks[j].attr);
1981 if (err) 2037 if (err)
1982 goto error2; 2038 goto error2;
1983 } 2039 }
1984 cpumask_set_cpu(cpu, mce_sysdev_initialized); 2040 cpumask_set_cpu(cpu, mce_device_initialized);
2041 mce_device[cpu] = dev;
1985 2042
1986 return 0; 2043 return 0;
1987error2: 2044error2:
1988 while (--j >= 0) 2045 while (--j >= 0)
1989 sysdev_remove_file(sysdev, &mce_banks[j].attr); 2046 device_remove_file(dev, &mce_banks[j].attr);
1990error: 2047error:
1991 while (--i >= 0) 2048 while (--i >= 0)
1992 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2049 device_remove_file(dev, mce_device_attrs[i]);
1993 2050
1994 sysdev_unregister(sysdev); 2051 device_unregister(dev);
1995 2052
1996 return err; 2053 return err;
1997} 2054}
1998 2055
1999static __cpuinit void mce_sysdev_remove(unsigned int cpu) 2056static __cpuinit void mce_device_remove(unsigned int cpu)
2000{ 2057{
2001 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2058 struct device *dev = mce_device[cpu];
2002 int i; 2059 int i;
2003 2060
2004 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) 2061 if (!cpumask_test_cpu(cpu, mce_device_initialized))
2005 return; 2062 return;
2006 2063
2007 for (i = 0; mce_sysdev_attrs[i]; i++) 2064 for (i = 0; mce_device_attrs[i]; i++)
2008 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2065 device_remove_file(dev, mce_device_attrs[i]);
2009 2066
2010 for (i = 0; i < banks; i++) 2067 for (i = 0; i < banks; i++)
2011 sysdev_remove_file(sysdev, &mce_banks[i].attr); 2068 device_remove_file(dev, &mce_banks[i].attr);
2012 2069
2013 sysdev_unregister(sysdev); 2070 device_unregister(dev);
2014 cpumask_clear_cpu(cpu, mce_sysdev_initialized); 2071 cpumask_clear_cpu(cpu, mce_device_initialized);
2072 mce_device[cpu] = NULL;
2015} 2073}
2016 2074
2017/* Make sure there are no machine checks on offlined CPUs. */ 2075/* Make sure there are no machine checks on offlined CPUs. */
@@ -2061,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2061 switch (action) { 2119 switch (action) {
2062 case CPU_ONLINE: 2120 case CPU_ONLINE:
2063 case CPU_ONLINE_FROZEN: 2121 case CPU_ONLINE_FROZEN:
2064 mce_sysdev_create(cpu); 2122 mce_device_create(cpu);
2065 if (threshold_cpu_callback) 2123 if (threshold_cpu_callback)
2066 threshold_cpu_callback(action, cpu); 2124 threshold_cpu_callback(action, cpu);
2067 break; 2125 break;
@@ -2069,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2069 case CPU_DEAD_FROZEN: 2127 case CPU_DEAD_FROZEN:
2070 if (threshold_cpu_callback) 2128 if (threshold_cpu_callback)
2071 threshold_cpu_callback(action, cpu); 2129 threshold_cpu_callback(action, cpu);
2072 mce_sysdev_remove(cpu); 2130 mce_device_remove(cpu);
2073 break; 2131 break;
2074 case CPU_DOWN_PREPARE: 2132 case CPU_DOWN_PREPARE:
2075 case CPU_DOWN_PREPARE_FROZEN: 2133 case CPU_DOWN_PREPARE_FROZEN:
@@ -2103,7 +2161,7 @@ static __init void mce_init_banks(void)
2103 2161
2104 for (i = 0; i < banks; i++) { 2162 for (i = 0; i < banks; i++) {
2105 struct mce_bank *b = &mce_banks[i]; 2163 struct mce_bank *b = &mce_banks[i];
2106 struct sysdev_attribute *a = &b->attr; 2164 struct device_attribute *a = &b->attr;
2107 2165
2108 sysfs_attr_init(&a->attr); 2166 sysfs_attr_init(&a->attr);
2109 a->attr.name = b->attrname; 2167 a->attr.name = b->attrname;
@@ -2123,16 +2181,16 @@ static __init int mcheck_init_device(void)
2123 if (!mce_available(&boot_cpu_data)) 2181 if (!mce_available(&boot_cpu_data))
2124 return -EIO; 2182 return -EIO;
2125 2183
2126 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); 2184 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
2127 2185
2128 mce_init_banks(); 2186 mce_init_banks();
2129 2187
2130 err = sysdev_class_register(&mce_sysdev_class); 2188 err = subsys_system_register(&mce_subsys, NULL);
2131 if (err) 2189 if (err)
2132 return err; 2190 return err;
2133 2191
2134 for_each_online_cpu(i) { 2192 for_each_online_cpu(i) {
2135 err = mce_sysdev_create(i); 2193 err = mce_device_create(i);
2136 if (err) 2194 if (err)
2137 return err; 2195 return err;
2138 } 2196 }
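With x86_mce_decoder_chain no longer exported, decoders attach through the mce_register_decode_chain()/mce_unregister_decode_chain() helpers added above, which also drain events logged before the decoder loaded. A rough sketch of an EDAC-style consumer; the callback body and names are illustrative, only the two helpers and struct mce come from this merge:

#include <linux/notifier.h>
#include <asm/mce.h>

static int my_mce_decoder(struct notifier_block *nb, unsigned long val,
			  void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	/* decode/report m->status, m->addr, m->misc here */
	return NOTIFY_OK;
}

static struct notifier_block my_mce_dec_nb = {
	.notifier_call	= my_mce_decoder,
};

static int __init my_decoder_init(void)
{
	mce_register_decode_chain(&my_mce_dec_nb);
	return 0;
}

static void __exit my_decoder_exit(void)
{
	mce_unregister_decode_chain(&my_mce_dec_nb);
}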
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..786e76a86322 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/kobject.h> 18#include <linux/kobject.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/sysdev.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23#include <linux/sysfs.h> 22#include <linux/sysfs.h>
@@ -64,11 +63,9 @@ struct threshold_bank {
64}; 63};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 64static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 65
67#ifdef CONFIG_SMP
68static unsigned char shared_bank[NR_BANKS] = { 66static unsigned char shared_bank[NR_BANKS] = {
69 0, 0, 0, 0, 1 67 0, 0, 0, 0, 1
70}; 68};
71#endif
72 69
73static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 70static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
74 71
@@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
202 199
203 if (!block) 200 if (!block)
204 per_cpu(bank_map, cpu) |= (1 << bank); 201 per_cpu(bank_map, cpu) |= (1 << bank);
205#ifdef CONFIG_SMP
206 if (shared_bank[bank] && c->cpu_core_id) 202 if (shared_bank[bank] && c->cpu_core_id)
207 break; 203 break;
208#endif 204
209 offset = setup_APIC_mce(offset, 205 offset = setup_APIC_mce(offset,
210 (high & MASK_LVTOFF_HI) >> 20); 206 (high & MASK_LVTOFF_HI) >> 20);
211 207
@@ -527,11 +523,11 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527{ 523{
528 int i, err = 0; 524 int i, err = 0;
529 struct threshold_bank *b = NULL; 525 struct threshold_bank *b = NULL;
526 struct device *dev = mce_device[cpu];
530 char name[32]; 527 char name[32];
531 528
532 sprintf(name, "threshold_bank%i", bank); 529 sprintf(name, "threshold_bank%i", bank);
533 530
534#ifdef CONFIG_SMP
535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 531 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
536 i = cpumask_first(cpu_llc_shared_mask(cpu)); 532 i = cpumask_first(cpu_llc_shared_mask(cpu));
537 533
@@ -548,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 544 if (!b)
549 goto out; 545 goto out;
550 546
551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, 547 err = sysfs_create_link(&dev->kobj, b->kobj, name);
552 b->kobj, name);
553 if (err) 548 if (err)
554 goto out; 549 goto out;
555 550
@@ -558,7 +553,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
558 553
559 goto out; 554 goto out;
560 } 555 }
561#endif
562 556
563 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 557 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
564 if (!b) { 558 if (!b) {
@@ -571,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 565 goto out;
572 } 566 }
573 567
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); 568 b->kobj = kobject_create_and_add(name, &dev->kobj);
575 if (!b->kobj) 569 if (!b->kobj)
576 goto out_free; 570 goto out_free;
577 571
@@ -591,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 585 if (i == cpu)
592 continue; 586 continue;
593 587
594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, 588 dev = mce_device[i];
595 b->kobj, name); 589 if (dev)
 590 err = sysfs_create_link(&dev->kobj, b->kobj, name);
596 if (err) 591 if (err)
597 goto out; 592 goto out;
598 593
@@ -655,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
655static void threshold_remove_bank(unsigned int cpu, int bank) 650static void threshold_remove_bank(unsigned int cpu, int bank)
656{ 651{
657 struct threshold_bank *b; 652 struct threshold_bank *b;
653 struct device *dev;
658 char name[32]; 654 char name[32];
659 int i = 0; 655 int i = 0;
660 656
@@ -669,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 665#ifdef CONFIG_SMP
670 /* sibling symlink */ 666 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 667 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); 668 sysfs_remove_link(&mce_device[cpu]->kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 669 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 670
675 return; 671 return;
@@ -681,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 677 if (i == cpu)
682 continue; 678 continue;
683 679
684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); 680 dev = mce_device[i];
681 if (dev)
682 sysfs_remove_link(&dev->kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 683 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 684 }
687 685
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..67bb17a37a0a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/sysdev.h>
23#include <linux/types.h> 22#include <linux/types.h>
24#include <linux/init.h> 23#include <linux/init.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
69static u32 lvtthmr_init __read_mostly; 68static u32 lvtthmr_init __read_mostly;
70 69
71#ifdef CONFIG_SYSFS 70#ifdef CONFIG_SYSFS
72#define define_therm_throt_sysdev_one_ro(_name) \ 71#define define_therm_throt_device_one_ro(_name) \
73 static SYSDEV_ATTR(_name, 0444, \ 72 static DEVICE_ATTR(_name, 0444, \
74 therm_throt_sysdev_show_##_name, \ 73 therm_throt_device_show_##_name, \
75 NULL) \ 74 NULL) \
76 75
77#define define_therm_throt_sysdev_show_func(event, name) \ 76#define define_therm_throt_device_show_func(event, name) \
78 \ 77 \
79static ssize_t therm_throt_sysdev_show_##event##_##name( \ 78static ssize_t therm_throt_device_show_##event##_##name( \
80 struct sys_device *dev, \ 79 struct device *dev, \
81 struct sysdev_attribute *attr, \ 80 struct device_attribute *attr, \
82 char *buf) \ 81 char *buf) \
83{ \ 82{ \
84 unsigned int cpu = dev->id; \ 83 unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
95 return ret; \ 94 return ret; \
96} 95}
97 96
98define_therm_throt_sysdev_show_func(core_throttle, count); 97define_therm_throt_device_show_func(core_throttle, count);
99define_therm_throt_sysdev_one_ro(core_throttle_count); 98define_therm_throt_device_one_ro(core_throttle_count);
100 99
101define_therm_throt_sysdev_show_func(core_power_limit, count); 100define_therm_throt_device_show_func(core_power_limit, count);
102define_therm_throt_sysdev_one_ro(core_power_limit_count); 101define_therm_throt_device_one_ro(core_power_limit_count);
103 102
104define_therm_throt_sysdev_show_func(package_throttle, count); 103define_therm_throt_device_show_func(package_throttle, count);
105define_therm_throt_sysdev_one_ro(package_throttle_count); 104define_therm_throt_device_one_ro(package_throttle_count);
106 105
107define_therm_throt_sysdev_show_func(package_power_limit, count); 106define_therm_throt_device_show_func(package_power_limit, count);
108define_therm_throt_sysdev_one_ro(package_power_limit_count); 107define_therm_throt_device_one_ro(package_power_limit_count);
109 108
110static struct attribute *thermal_throttle_attrs[] = { 109static struct attribute *thermal_throttle_attrs[] = {
111 &attr_core_throttle_count.attr, 110 &dev_attr_core_throttle_count.attr,
112 NULL 111 NULL
113}; 112};
114 113
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
223 222
224#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
225/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
226static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct device *dev,
227 unsigned int cpu) 226 unsigned int cpu)
228{ 227{
229 int err; 228 int err;
230 struct cpuinfo_x86 *c = &cpu_data(cpu); 229 struct cpuinfo_x86 *c = &cpu_data(cpu);
231 230
232 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); 231 err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
233 if (err) 232 if (err)
234 return err; 233 return err;
235 234
236 if (cpu_has(c, X86_FEATURE_PLN)) 235 if (cpu_has(c, X86_FEATURE_PLN))
237 err = sysfs_add_file_to_group(&sys_dev->kobj, 236 err = sysfs_add_file_to_group(&dev->kobj,
238 &attr_core_power_limit_count.attr, 237 &dev_attr_core_power_limit_count.attr,
239 thermal_attr_group.name); 238 thermal_attr_group.name);
240 if (cpu_has(c, X86_FEATURE_PTS)) { 239 if (cpu_has(c, X86_FEATURE_PTS)) {
241 err = sysfs_add_file_to_group(&sys_dev->kobj, 240 err = sysfs_add_file_to_group(&dev->kobj,
242 &attr_package_throttle_count.attr, 241 &dev_attr_package_throttle_count.attr,
243 thermal_attr_group.name); 242 thermal_attr_group.name);
244 if (cpu_has(c, X86_FEATURE_PLN)) 243 if (cpu_has(c, X86_FEATURE_PLN))
245 err = sysfs_add_file_to_group(&sys_dev->kobj, 244 err = sysfs_add_file_to_group(&dev->kobj,
246 &attr_package_power_limit_count.attr, 245 &dev_attr_package_power_limit_count.attr,
247 thermal_attr_group.name); 246 thermal_attr_group.name);
248 } 247 }
249 248
250 return err; 249 return err;
251} 250}
252 251
253static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 252static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
254{ 253{
255 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); 254 sysfs_remove_group(&dev->kobj, &thermal_attr_group);
256} 255}
257 256
258/* Mutex protecting device creation against CPU hotplug: */ 257/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
265 void *hcpu) 264 void *hcpu)
266{ 265{
267 unsigned int cpu = (unsigned long)hcpu; 266 unsigned int cpu = (unsigned long)hcpu;
268 struct sys_device *sys_dev; 267 struct device *dev;
269 int err = 0; 268 int err = 0;
270 269
271 sys_dev = get_cpu_sysdev(cpu); 270 dev = get_cpu_device(cpu);
272 271
273 switch (action) { 272 switch (action) {
274 case CPU_UP_PREPARE: 273 case CPU_UP_PREPARE:
275 case CPU_UP_PREPARE_FROZEN: 274 case CPU_UP_PREPARE_FROZEN:
276 mutex_lock(&therm_cpu_lock); 275 mutex_lock(&therm_cpu_lock);
277 err = thermal_throttle_add_dev(sys_dev, cpu); 276 err = thermal_throttle_add_dev(dev, cpu);
278 mutex_unlock(&therm_cpu_lock); 277 mutex_unlock(&therm_cpu_lock);
279 WARN_ON(err); 278 WARN_ON(err);
280 break; 279 break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
283 case CPU_DEAD: 282 case CPU_DEAD:
284 case CPU_DEAD_FROZEN: 283 case CPU_DEAD_FROZEN:
285 mutex_lock(&therm_cpu_lock); 284 mutex_lock(&therm_cpu_lock);
286 thermal_throttle_remove_dev(sys_dev); 285 thermal_throttle_remove_dev(dev);
287 mutex_unlock(&therm_cpu_lock); 286 mutex_unlock(&therm_cpu_lock);
288 break; 287 break;
289 } 288 }
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
310#endif 309#endif
311 /* connect live CPUs to sysfs */ 310 /* connect live CPUs to sysfs */
312 for_each_online_cpu(cpu) { 311 for_each_online_cpu(cpu) {
313 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); 312 err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
314 WARN_ON(err); 313 WARN_ON(err);
315 } 314 }
316#ifdef CONFIG_HOTPLUG_CPU 315#ifdef CONFIG_HOTPLUG_CPU
@@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);
323 322
324#endif /* CONFIG_SYSFS */ 323#endif /* CONFIG_SYSFS */
325 324
326/*
327 * Set up the most two significant bit to notify mce log that this thermal
328 * event type.
329 * This is a temp solution. May be changed in the future with mce log
330 * infrasture.
331 */
332#define CORE_THROTTLED (0)
333#define CORE_POWER_LIMIT ((__u64)1 << 62)
334#define PACKAGE_THROTTLED ((__u64)2 << 62)
335#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
336
337static void notify_thresholds(__u64 msr_val) 325static void notify_thresholds(__u64 msr_val)
338{ 326{
339 /* check whether the interrupt handler is defined; 327 /* check whether the interrupt handler is defined;
@@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)
363 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 351 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
364 THERMAL_THROTTLING_EVENT, 352 THERMAL_THROTTLING_EVENT,
365 CORE_LEVEL) != 0) 353 CORE_LEVEL) != 0)
366 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 354 mce_log_therm_throt_event(msr_val);
367 355
368 if (this_cpu_has(X86_FEATURE_PLN)) 356 if (this_cpu_has(X86_FEATURE_PLN))
369 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 357 therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
370 POWER_LIMIT_EVENT, 358 POWER_LIMIT_EVENT,
371 CORE_LEVEL) != 0) 359 CORE_LEVEL);
372 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
373 360
374 if (this_cpu_has(X86_FEATURE_PTS)) { 361 if (this_cpu_has(X86_FEATURE_PTS)) {
375 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 362 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
376 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 363 therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
377 THERMAL_THROTTLING_EVENT, 364 THERMAL_THROTTLING_EVENT,
378 PACKAGE_LEVEL) != 0) 365 PACKAGE_LEVEL);
379 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
380 if (this_cpu_has(X86_FEATURE_PLN)) 366 if (this_cpu_has(X86_FEATURE_PLN))
381 if (therm_throt_process(msr_val & 367 therm_throt_process(msr_val &
382 PACKAGE_THERM_STATUS_POWER_LIMIT, 368 PACKAGE_THERM_STATUS_POWER_LIMIT,
383 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
384 PACKAGE_LEVEL) != 0) 370 PACKAGE_LEVEL);
385 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
386 | msr_val);
387 } 371 }
388} 372}
389 373
@@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
397 381
398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) 382asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
399{ 383{
400 exit_idle();
401 irq_enter(); 384 irq_enter();
385 exit_idle();
402 inc_irq_stat(irq_thermal_count); 386 inc_irq_stat(irq_thermal_count);
403 smp_thermal_vector(); 387 smp_thermal_vector();
404 irq_exit(); 388 irq_exit();
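The SYSDEV_ATTR to DEVICE_ATTR conversion earlier in this file follows the same recipe as mce.c: per-CPU show functions now take a struct device whose id field is the CPU number. A minimal sketch of one such read-only attribute, with the counter and names hypothetical:

#include <linux/device.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, my_counter);

static ssize_t my_count_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	unsigned int cpu = dev->id;	/* CPU device: id is the CPU number */

	return sprintf(buf, "%u\n", per_cpu(my_counter, cpu));
}
static DEVICE_ATTR(my_count, 0444, my_count_show, NULL);
/* &dev_attr_my_count.attr can then be placed in an attribute group */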
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void smp_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle();
23 irq_enter(); 22 irq_enter();
23 exit_idle();
24 inc_irq_stat(irq_threshold_count); 24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector(); 25 mce_threshold_vector();
26 irq_exit(); 26 irq_exit();
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a71efcdbb092..97b26356e9ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
547 547
548 if (tmp != mask_lo) { 548 if (tmp != mask_lo) {
549 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); 549 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
550 add_taint(TAINT_FIRMWARE_WORKAROUND);
550 mask_lo = tmp; 551 mask_lo = tmp;
551 } 552 }
552 } 553 }
@@ -693,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
693 694
694 /* Disable MTRRs, and set the default type to uncached */ 695 /* Disable MTRRs, and set the default type to uncached */
695 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); 696 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
697 wbinvd();
696} 698}
697 699
698static void post_set(void) __releases(set_atomicity_lock) 700static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 640891014b2a..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event)
312 return -EOPNOTSUPP; 312 return -EOPNOTSUPP;
313 } 313 }
314 314
315 /*
316 * Do not allow config1 (extended registers) to propagate,
317 * there's no sane user-space generalization yet:
318 */
319 if (attr->type == PERF_TYPE_RAW) 315 if (attr->type == PERF_TYPE_RAW)
320 return 0; 316 return x86_pmu_extra_regs(event->attr.config, event);
321 317
322 if (attr->type == PERF_TYPE_HW_CACHE) 318 if (attr->type == PERF_TYPE_HW_CACHE)
323 return set_ext_hw_attr(hwc, event); 319 return set_ext_hw_attr(hwc, event);
@@ -488,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
488 return event->pmu == &pmu; 484 return event->pmu == &pmu;
489} 485}
490 486
487/*
488 * Event scheduler state:
489 *
490 * Assign events iterating over all events and counters, beginning
491 * with events with least weights first. Keep the current iterator
492 * state in struct sched_state.
493 */
494struct sched_state {
495 int weight;
496 int event; /* event index */
497 int counter; /* counter index */
498 int unassigned; /* number of events to be assigned left */
499 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
500};
501
502/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
503#define SCHED_STATES_MAX 2
504
505struct perf_sched {
506 int max_weight;
507 int max_events;
508 struct event_constraint **constraints;
509 struct sched_state state;
510 int saved_states;
511 struct sched_state saved[SCHED_STATES_MAX];
512};
513
514/*
 515 * Initialize iterator that runs through all events and counters.
516 */
517static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
518 int num, int wmin, int wmax)
519{
520 int idx;
521
522 memset(sched, 0, sizeof(*sched));
523 sched->max_events = num;
524 sched->max_weight = wmax;
525 sched->constraints = c;
526
527 for (idx = 0; idx < num; idx++) {
528 if (c[idx]->weight == wmin)
529 break;
530 }
531
532 sched->state.event = idx; /* start with min weight */
533 sched->state.weight = wmin;
534 sched->state.unassigned = num;
535}
536
537static void perf_sched_save_state(struct perf_sched *sched)
538{
539 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
540 return;
541
542 sched->saved[sched->saved_states] = sched->state;
543 sched->saved_states++;
544}
545
546static bool perf_sched_restore_state(struct perf_sched *sched)
547{
548 if (!sched->saved_states)
549 return false;
550
551 sched->saved_states--;
552 sched->state = sched->saved[sched->saved_states];
553
554 /* continue with next counter: */
555 clear_bit(sched->state.counter++, sched->state.used);
556
557 return true;
558}
559
560/*
561 * Select a counter for the current event to schedule. Return true on
562 * success.
563 */
564static bool __perf_sched_find_counter(struct perf_sched *sched)
565{
566 struct event_constraint *c;
567 int idx;
568
569 if (!sched->state.unassigned)
570 return false;
571
572 if (sched->state.event >= sched->max_events)
573 return false;
574
575 c = sched->constraints[sched->state.event];
576
577 /* Prefer fixed purpose counters */
578 if (x86_pmu.num_counters_fixed) {
579 idx = X86_PMC_IDX_FIXED;
580 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
581 if (!__test_and_set_bit(idx, sched->state.used))
582 goto done;
583 }
584 }
585 /* Grab the first unused counter starting with idx */
586 idx = sched->state.counter;
587 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
588 if (!__test_and_set_bit(idx, sched->state.used))
589 goto done;
590 }
591
592 return false;
593
594done:
595 sched->state.counter = idx;
596
597 if (c->overlap)
598 perf_sched_save_state(sched);
599
600 return true;
601}
602
603static bool perf_sched_find_counter(struct perf_sched *sched)
604{
605 while (!__perf_sched_find_counter(sched)) {
606 if (!perf_sched_restore_state(sched))
607 return false;
608 }
609
610 return true;
611}
612
613/*
614 * Go through all unassigned events and find the next one to schedule.
615 * Take events with the least weight first. Return true on success.
616 */
617static bool perf_sched_next_event(struct perf_sched *sched)
618{
619 struct event_constraint *c;
620
621 if (!sched->state.unassigned || !--sched->state.unassigned)
622 return false;
623
624 do {
625 /* next event */
626 sched->state.event++;
627 if (sched->state.event >= sched->max_events) {
628 /* next weight */
629 sched->state.event = 0;
630 sched->state.weight++;
631 if (sched->state.weight > sched->max_weight)
632 return false;
633 }
634 c = sched->constraints[sched->state.event];
635 } while (c->weight != sched->state.weight);
636
637 sched->state.counter = 0; /* start with first counter */
638
639 return true;
640}
641
642/*
643 * Assign a counter for each event.
644 */
645static int perf_assign_events(struct event_constraint **constraints, int n,
646 int wmin, int wmax, int *assign)
647{
648 struct perf_sched sched;
649
650 perf_sched_init(&sched, constraints, n, wmin, wmax);
651
652 do {
653 if (!perf_sched_find_counter(&sched))
654 break; /* failed */
655 if (assign)
656 assign[sched.state.event] = sched.state.counter;
657 } while (perf_sched_next_event(&sched));
658
659 return sched.state.unassigned;
660}
661
491int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) 662int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
492{ 663{
493 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; 664 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
494 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 665 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
495 int i, j, w, wmax, num = 0; 666 int i, wmin, wmax, num = 0;
496 struct hw_perf_event *hwc; 667 struct hw_perf_event *hwc;
497 668
498 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 669 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
499 670
500 for (i = 0; i < n; i++) { 671 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
501 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 672 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
502 constraints[i] = c; 673 constraints[i] = c;
674 wmin = min(wmin, c->weight);
675 wmax = max(wmax, c->weight);
503 } 676 }
504 677
505 /* 678 /*
@@ -525,59 +698,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
525 if (assign) 698 if (assign)
526 assign[i] = hwc->idx; 699 assign[i] = hwc->idx;
527 } 700 }
528 if (i == n)
529 goto done;
530
531 /*
532 * begin slow path
533 */
534
535 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
536
537 /*
538 * weight = number of possible counters
539 *
540 * 1 = most constrained, only works on one counter
541 * wmax = least constrained, works on any counter
542 *
543 * assign events to counters starting with most
544 * constrained events.
545 */
546 wmax = x86_pmu.num_counters;
547
548 /*
549 * when fixed event counters are present,
550 * wmax is incremented by 1 to account
551 * for one more choice
552 */
553 if (x86_pmu.num_counters_fixed)
554 wmax++;
555
556 for (w = 1, num = n; num && w <= wmax; w++) {
557 /* for each event */
558 for (i = 0; num && i < n; i++) {
559 c = constraints[i];
560 hwc = &cpuc->event_list[i]->hw;
561
562 if (c->weight != w)
563 continue;
564 701
565 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { 702 /* slow path */
566 if (!test_bit(j, used_mask)) 703 if (i != n)
567 break; 704 num = perf_assign_events(constraints, n, wmin, wmax, assign);
568 }
569
570 if (j == X86_PMC_IDX_MAX)
571 break;
572 705
573 __set_bit(j, used_mask);
574
575 if (assign)
576 assign[i] = j;
577 num--;
578 }
579 }
580done:
581 /* 706 /*
582 * scheduling failed or is just a simulation, 707 * scheduling failed or is just a simulation,
583 * free resources if necessary 708 * free resources if necessary
@@ -588,7 +713,7 @@ done:
588 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); 713 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
589 } 714 }
590 } 715 }
591 return num ? -ENOSPC : 0; 716 return num ? -EINVAL : 0;
592} 717}
593 718
594/* 719/*
@@ -607,7 +732,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
607 732
608 if (is_x86_event(leader)) { 733 if (is_x86_event(leader)) {
609 if (n >= max_count) 734 if (n >= max_count)
610 return -ENOSPC; 735 return -EINVAL;
611 cpuc->event_list[n] = leader; 736 cpuc->event_list[n] = leader;
612 n++; 737 n++;
613 } 738 }
@@ -620,7 +745,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
620 continue; 745 continue;
621 746
622 if (n >= max_count) 747 if (n >= max_count)
623 return -ENOSPC; 748 return -EINVAL;
624 749
625 cpuc->event_list[n] = event; 750 cpuc->event_list[n] = event;
626 n++; 751 n++;
@@ -1123,6 +1248,7 @@ static void __init pmu_check_apic(void)
1123 1248
1124static int __init init_hw_perf_events(void) 1249static int __init init_hw_perf_events(void)
1125{ 1250{
1251 struct x86_pmu_quirk *quirk;
1126 struct event_constraint *c; 1252 struct event_constraint *c;
1127 int err; 1253 int err;
1128 1254
@@ -1151,8 +1277,8 @@ static int __init init_hw_perf_events(void)
1151 1277
1152 pr_cont("%s PMU driver.\n", x86_pmu.name); 1278 pr_cont("%s PMU driver.\n", x86_pmu.name);
1153 1279
1154 if (x86_pmu.quirks) 1280 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1155 x86_pmu.quirks(); 1281 quirk->func();
1156 1282
1157 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1283 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1158 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1284 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1175,12 +1301,18 @@ static int __init init_hw_perf_events(void)
1175 1301
1176 unconstrained = (struct event_constraint) 1302 unconstrained = (struct event_constraint)
1177 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1303 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1178 0, x86_pmu.num_counters); 1304 0, x86_pmu.num_counters, 0);
1179 1305
1180 if (x86_pmu.event_constraints) { 1306 if (x86_pmu.event_constraints) {
1307 /*
1308 * event on fixed counter2 (REF_CYCLES) only works on this
1309 * counter, so do not extend mask to generic counters
1310 */
1181 for_each_event_constraint(c, x86_pmu.event_constraints) { 1311 for_each_event_constraint(c, x86_pmu.event_constraints) {
1182 if (c->cmask != X86_RAW_EVENT_MASK) 1312 if (c->cmask != X86_RAW_EVENT_MASK
1313 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1183 continue; 1314 continue;
1315 }
1184 1316
1185 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; 1317 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1186 c->weight += x86_pmu.num_counters; 1318 c->weight += x86_pmu.num_counters;
@@ -1316,7 +1448,7 @@ static int validate_event(struct perf_event *event)
1316 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1448 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1317 1449
1318 if (!c || !c->weight) 1450 if (!c || !c->weight)
1319 ret = -ENOSPC; 1451 ret = -EINVAL;
1320 1452
1321 if (x86_pmu.put_event_constraints) 1453 if (x86_pmu.put_event_constraints)
1322 x86_pmu.put_event_constraints(fake_cpuc, event); 1454 x86_pmu.put_event_constraints(fake_cpuc, event);
@@ -1341,7 +1473,7 @@ static int validate_group(struct perf_event *event)
1341{ 1473{
1342 struct perf_event *leader = event->group_leader; 1474 struct perf_event *leader = event->group_leader;
1343 struct cpu_hw_events *fake_cpuc; 1475 struct cpu_hw_events *fake_cpuc;
1344 int ret = -ENOSPC, n; 1476 int ret = -EINVAL, n;
1345 1477
1346 fake_cpuc = allocate_fake_cpuc(); 1478 fake_cpuc = allocate_fake_cpuc();
1347 if (IS_ERR(fake_cpuc)) 1479 if (IS_ERR(fake_cpuc))
@@ -1570,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1570 1702
1571 return misc; 1703 return misc;
1572} 1704}
1705
1706void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
1707{
1708 cap->version = x86_pmu.version;
1709 cap->num_counters_gp = x86_pmu.num_counters;
1710 cap->num_counters_fixed = x86_pmu.num_counters_fixed;
1711 cap->bit_width_gp = x86_pmu.cntval_bits;
1712 cap->bit_width_fixed = x86_pmu.cntval_bits;
1713 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
1714 cap->events_mask_len = x86_pmu.events_mask_len;
1715}
1716EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
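perf_get_x86_pmu_capability() is exported so other kernel code (KVM's PMU virtualization is the intended consumer) can take a snapshot of the host PMU. A minimal sketch of a caller, assuming the struct and prototype land in <asm/perf_event.h> as part of this merge:

#include <linux/kernel.h>
#include <asm/perf_event.h>

static void report_host_pmu(void)
{
	struct x86_pmu_capability cap;

	perf_get_x86_pmu_capability(&cap);
	pr_info("PMU v%d: %d GP counters (%d bit), %d fixed, event mask %#x/%d\n",
		cap.version, cap.num_counters_gp, cap.bit_width_gp,
		cap.num_counters_fixed, cap.events_mask, cap.events_mask_len);
}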
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index b9698d40ac4b..8944062f46e2 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -45,6 +45,7 @@ struct event_constraint {
45 u64 code; 45 u64 code;
46 u64 cmask; 46 u64 cmask;
47 int weight; 47 int weight;
48 int overlap;
48}; 49};
49 50
50struct amd_nb { 51struct amd_nb {
@@ -151,15 +152,40 @@ struct cpu_hw_events {
151 void *kfree_on_online; 152 void *kfree_on_online;
152}; 153};
153 154
154#define __EVENT_CONSTRAINT(c, n, m, w) {\ 155#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
155 { .idxmsk64 = (n) }, \ 156 { .idxmsk64 = (n) }, \
156 .code = (c), \ 157 .code = (c), \
157 .cmask = (m), \ 158 .cmask = (m), \
158 .weight = (w), \ 159 .weight = (w), \
160 .overlap = (o), \
159} 161}
160 162
161#define EVENT_CONSTRAINT(c, n, m) \ 163#define EVENT_CONSTRAINT(c, n, m) \
162 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 164 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
165
166/*
167 * The overlap flag marks event constraints with overlapping counter
168 * masks. This is the case if the counter mask of such an event is not
169 * a subset of any other counter mask of a constraint with an equal or
170 * higher weight, e.g.:
171 *
172 * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
173 * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
174 * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
175 *
176 * The event scheduler may not select the correct counter in the first
177 * cycle because it needs to know which subsequent events will be
178 * scheduled. It may fail to schedule the events then. So we set the
179 * overlap flag for such constraints to give the scheduler a hint which
180 * events to select for counter rescheduling.
181 *
182 * Care must be taken as the rescheduling algorithm is O(n!) which
 183 * will increase scheduling cycles for an over-committed system
 184 * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
 185 * and their counter masks must therefore be kept to a minimum.
186 */
187#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
188 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
163 189
164/* 190/*
165 * Constraint on the Event code. 191 * Constraint on the Event code.
@@ -235,6 +261,11 @@ union perf_capabilities {
235 u64 capabilities; 261 u64 capabilities;
236}; 262};
237 263
264struct x86_pmu_quirk {
265 struct x86_pmu_quirk *next;
266 void (*func)(void);
267};
268
238/* 269/*
239 * struct x86_pmu - generic x86 pmu 270 * struct x86_pmu - generic x86 pmu
240 */ 271 */
@@ -259,6 +290,11 @@ struct x86_pmu {
259 int num_counters_fixed; 290 int num_counters_fixed;
260 int cntval_bits; 291 int cntval_bits;
261 u64 cntval_mask; 292 u64 cntval_mask;
293 union {
294 unsigned long events_maskl;
295 unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
296 };
297 int events_mask_len;
262 int apic; 298 int apic;
263 u64 max_period; 299 u64 max_period;
264 struct event_constraint * 300 struct event_constraint *
@@ -268,7 +304,7 @@ struct x86_pmu {
268 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 304 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
269 struct perf_event *event); 305 struct perf_event *event);
270 struct event_constraint *event_constraints; 306 struct event_constraint *event_constraints;
271 void (*quirks)(void); 307 struct x86_pmu_quirk *quirks;
272 int perfctr_second_write; 308 int perfctr_second_write;
273 309
274 int (*cpu_prepare)(int cpu); 310 int (*cpu_prepare)(int cpu);
@@ -309,6 +345,15 @@ struct x86_pmu {
309 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); 345 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
310}; 346};
311 347
348#define x86_add_quirk(func_) \
349do { \
350 static struct x86_pmu_quirk __quirk __initdata = { \
351 .func = func_, \
352 }; \
353 __quirk.next = x86_pmu.quirks; \
354 x86_pmu.quirks = &__quirk; \
355} while (0)
356
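For orientation, a hedged sketch of how the quirk list built by x86_add_quirk() is consumed, using the struct x86_pmu_quirk and x86_pmu fields defined above (the actual walk lives in perf_event.c, which is not part of this hunk; the function name here is invented):

    static void run_pmu_quirks_sketch(void)
    {
            struct x86_pmu_quirk *quirk;

            for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
                    quirk->func();
    }

Because x86_add_quirk() prepends to the list, quirks run in reverse registration order; that is why intel_pmu_init() installs intel_arch_events_quirk first so that it runs last.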
312#define ERF_NO_HT_SHARING 1 357#define ERF_NO_HT_SHARING 1
313#define ERF_HAS_RSP_1 2 358#define ERF_HAS_RSP_1 2
314 359
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index aeefd45697a2..0397b23be8e9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {
492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); 493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); 494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); 495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); 496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
498 498
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index ab6343d21825..3b8a2d30d14e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void)
199 goto out; 199 goto out;
200 } 200 }
201 201
202 pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); 202 pr_info("IBS: LVT offset %d assigned\n", offset);
203 pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
204 203
205 return 0; 204 return 0;
206out: 205out:
@@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h
265static __init int amd_ibs_init(void) 264static __init int amd_ibs_init(void)
266{ 265{
267 u32 caps; 266 u32 caps;
268 int ret; 267 int ret = -EINVAL;
269 268
270 caps = __get_ibs_caps(); 269 caps = __get_ibs_caps();
271 if (!caps) 270 if (!caps)
272 return -ENODEV; /* ibs not supported by the cpu */ 271 return -ENODEV; /* ibs not supported by the cpu */
273 272
274 if (!ibs_eilvt_valid()) { 273 /*
275 ret = force_ibs_eilvt_setup(); 274 * Force LVT offset assignment for family 10h: The offsets are
276 if (ret) { 275 * not assigned by the BIOS for this family, so the OS is
277 pr_err("Failed to setup IBS, %d\n", ret); 276 * responsible for doing it. If the OS assignment fails, fall
278 return ret; 277 * back to BIOS settings and try to set this up.
279 } 278 */
280 } 279 if (boot_cpu_data.x86 == 0x10)
280 force_ibs_eilvt_setup();
281
282 if (!ibs_eilvt_valid())
283 goto out;
281 284
282 get_online_cpus(); 285 get_online_cpus();
283 ibs_caps = caps; 286 ibs_caps = caps;
@@ -287,7 +290,11 @@ static __init int amd_ibs_init(void)
287 smp_call_function(setup_APIC_ibs, NULL, 1); 290 smp_call_function(setup_APIC_ibs, NULL, 1);
288 put_online_cpus(); 291 put_online_cpus();
289 292
290 return perf_event_ibs_init(); 293 ret = perf_event_ibs_init();
294out:
295 if (ret)
296 pr_err("Failed to setup IBS, %d\n", ret);
297 return ret;
291} 298}
292 299
293/* Since we need the pci subsystem to init ibs we can't do this earlier: */ 300/* Since we need the pci subsystem to init ibs we can't do this earlier: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2be5ebe99872..3bd37bdf1b8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
31}; 32};
32 33
33static struct event_constraint intel_core_event_constraints[] __read_mostly = 34static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
45{ 46{
46 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 47 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
47 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 48 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
48 /* 49 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
49 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
50 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
51 * ratio between these counters.
52 */
53 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
54 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 50 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
55 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 51 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
56 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 52 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
68{ 64{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 65 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 66 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 67 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ 68 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
73 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ 69 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
74 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ 70 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
90{ 86{
91 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
92 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 88 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
93 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 89 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
94 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 90 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
95 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 91 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
96 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 92 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
102{ 98{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 99 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
105 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
106 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
107 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
108 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
125{ 121{
126 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 122 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
127 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 123 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
128 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 124 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
129 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
130}; 126};
131 127
@@ -1169,7 +1165,7 @@ again:
1169 */ 1165 */
1170 c = &unconstrained; 1166 c = &unconstrained;
1171 } else if (intel_try_alt_er(event, orig_idx)) { 1167 } else if (intel_try_alt_er(event, orig_idx)) {
1172 raw_spin_unlock(&era->lock); 1168 raw_spin_unlock_irqrestore(&era->lock, flags);
1173 goto again; 1169 goto again;
1174 } 1170 }
1175 raw_spin_unlock_irqrestore(&era->lock, flags); 1171 raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1519 .guest_get_msrs = intel_guest_get_msrs, 1515 .guest_get_msrs = intel_guest_get_msrs,
1520}; 1516};
1521 1517
1522static void intel_clovertown_quirks(void) 1518static __init void intel_clovertown_quirk(void)
1523{ 1519{
1524 /* 1520 /*
1525 * PEBS is unreliable due to: 1521 * PEBS is unreliable due to:
@@ -1545,12 +1541,60 @@ static void intel_clovertown_quirks(void)
1545 x86_pmu.pebs_constraints = NULL; 1541 x86_pmu.pebs_constraints = NULL;
1546} 1542}
1547 1543
1544static __init void intel_sandybridge_quirk(void)
1545{
1546 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
1547 x86_pmu.pebs = 0;
1548 x86_pmu.pebs_constraints = NULL;
1549}
1550
1551static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
1552 { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
1553 { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
1554 { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
1555 { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
1556 { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
1557 { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
1558 { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
1559};
1560
1561static __init void intel_arch_events_quirk(void)
1562{
1563 int bit;
1564
 1565 /* disable events reported as not present by cpuid */
1566 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1567 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1568 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
1569 intel_arch_events_map[bit].name);
1570 }
1571}
1572
1573static __init void intel_nehalem_quirk(void)
1574{
1575 union cpuid10_ebx ebx;
1576
1577 ebx.full = x86_pmu.events_maskl;
1578 if (ebx.split.no_branch_misses_retired) {
1579 /*
1580 * Erratum AAJ80 detected, we work it around by using
1581 * the BR_MISP_EXEC.ANY event. This will over-count
1582 * branch-misses, but it's still much better than the
1583 * architectural event which is often completely bogus:
1584 */
1585 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1586 ebx.split.no_branch_misses_retired = 0;
1587 x86_pmu.events_maskl = ebx.full;
1588 printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
1589 }
1590}
1591
1548__init int intel_pmu_init(void) 1592__init int intel_pmu_init(void)
1549{ 1593{
1550 union cpuid10_edx edx; 1594 union cpuid10_edx edx;
1551 union cpuid10_eax eax; 1595 union cpuid10_eax eax;
1596 union cpuid10_ebx ebx;
1552 unsigned int unused; 1597 unsigned int unused;
1553 unsigned int ebx;
1554 int version; 1598 int version;
1555 1599
1556 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 1600 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1567,8 +1611,8 @@ __init int intel_pmu_init(void)
1567 * Check whether the Architectural PerfMon supports 1611 * Check whether the Architectural PerfMon supports
1568 * Branch Misses Retired hw_event or not. 1612 * Branch Misses Retired hw_event or not.
1569 */ 1613 */
1570 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 1614 cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
1571 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 1615 if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
1572 return -ENODEV; 1616 return -ENODEV;
1573 1617
1574 version = eax.split.version_id; 1618 version = eax.split.version_id;
@@ -1582,6 +1626,9 @@ __init int intel_pmu_init(void)
1582 x86_pmu.cntval_bits = eax.split.bit_width; 1626 x86_pmu.cntval_bits = eax.split.bit_width;
1583 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; 1627 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
1584 1628
1629 x86_pmu.events_maskl = ebx.full;
1630 x86_pmu.events_mask_len = eax.split.mask_length;
1631
1585 /* 1632 /*
1586 * Quirk: v2 perfmon does not report fixed-purpose events, so 1633 * Quirk: v2 perfmon does not report fixed-purpose events, so
1587 * assume at least 3 events: 1634 * assume at least 3 events:
@@ -1601,6 +1648,8 @@ __init int intel_pmu_init(void)
1601 1648
1602 intel_ds_init(); 1649 intel_ds_init();
1603 1650
1651 x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
1652
1604 /* 1653 /*
1605 * Install the hw-cache-events table: 1654 * Install the hw-cache-events table:
1606 */ 1655 */
@@ -1610,7 +1659,7 @@ __init int intel_pmu_init(void)
1610 break; 1659 break;
1611 1660
1612 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1661 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1613 x86_pmu.quirks = intel_clovertown_quirks; 1662 x86_add_quirk(intel_clovertown_quirk);
1614 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1663 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1615 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1664 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1616 case 29: /* six-core 45 nm xeon "Dunnington" */ 1665 case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1644,17 +1693,8 @@ __init int intel_pmu_init(void)
1644 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1693 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1645 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1694 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1646 1695
1647 if (ebx & 0x40) { 1696 x86_add_quirk(intel_nehalem_quirk);
1648 /*
1649 * Erratum AAJ80 detected, we work it around by using
1650 * the BR_MISP_EXEC.ANY event. This will over-count
1651 * branch-misses, but it's still much better than the
1652 * architectural event which is often completely bogus:
1653 */
1654 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1655 1697
1656 pr_cont("erratum AAJ80 worked around, ");
1657 }
1658 pr_cont("Nehalem events, "); 1698 pr_cont("Nehalem events, ");
1659 break; 1699 break;
1660 1700
@@ -1694,6 +1734,7 @@ __init int intel_pmu_init(void)
1694 break; 1734 break;
1695 1735
1696 case 42: /* SandyBridge */ 1736 case 42: /* SandyBridge */
1737 x86_add_quirk(intel_sandybridge_quirk);
1697 case 45: /* SandyBridge, "Romley-EP" */ 1738 case 45: /* SandyBridge, "Romley-EP" */
1698 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1739 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1699 sizeof(hw_cache_event_ids)); 1740 sizeof(hw_cache_event_ids));
@@ -1730,5 +1771,6 @@ __init int intel_pmu_init(void)
1730 break; 1771 break;
1731 } 1772 }
1732 } 1773 }
1774
1733 return 0; 1775 return 0;
1734} 1776}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c0d238f49db8..73da6b64f5b7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -493,6 +493,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
493 unsigned long from = cpuc->lbr_entries[0].from; 493 unsigned long from = cpuc->lbr_entries[0].from;
494 unsigned long old_to, to = cpuc->lbr_entries[0].to; 494 unsigned long old_to, to = cpuc->lbr_entries[0].to;
495 unsigned long ip = regs->ip; 495 unsigned long ip = regs->ip;
496 int is_64bit = 0;
496 497
497 /* 498 /*
498 * We don't need to fixup if the PEBS assist is fault like 499 * We don't need to fixup if the PEBS assist is fault like
@@ -544,7 +545,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
544 } else 545 } else
545 kaddr = (void *)to; 546 kaddr = (void *)to;
546 547
547 kernel_insn_init(&insn, kaddr); 548#ifdef CONFIG_X86_64
549 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
550#endif
551 insn_init(&insn, kaddr, is_64bit);
548 insn_get_length(&insn); 552 insn_get_length(&insn);
549 to += insn.length; 553 to += insn.length;
550 } while (to < ip); 554 } while (to < ip);
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 492bf1358a7c..ef484d9d0a25 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1268,7 +1268,7 @@ reserve:
1268 } 1268 }
1269 1269
1270done: 1270done:
1271 return num ? -ENOSPC : 0; 1271 return num ? -EINVAL : 0;
1272} 1272}
1273 1273
1274static __initconst const struct x86_pmu p4_pmu = { 1274static __initconst const struct x86_pmu p4_pmu = {
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
16 "100mhzsteps", 16 "100mhzsteps",
17 "hwpstate", 17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */ 18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */ 19 "cpb", /* core performance boost */
20 "eff_freq_ro", /* Readonly aperf/mperf */
20}; 21};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 14b23140e81f..8022c6681485 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
64static int show_cpuinfo(struct seq_file *m, void *v) 64static int show_cpuinfo(struct seq_file *m, void *v)
65{ 65{
66 struct cpuinfo_x86 *c = v; 66 struct cpuinfo_x86 *c = v;
67 unsigned int cpu = 0; 67 unsigned int cpu;
68 int i; 68 int i;
69 69
70#ifdef CONFIG_SMP
71 cpu = c->cpu_index; 70 cpu = c->cpu_index;
72#endif
73 seq_printf(m, "processor\t: %u\n" 71 seq_printf(m, "processor\t: %u\n"
74 "vendor_id\t: %s\n" 72 "vendor_id\t: %s\n"
75 "cpu family\t: %d\n" 73 "cpu family\t: %d\n"
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
177 .notifier_call = cpuid_class_cpu_callback, 177 .notifier_call = cpuid_class_cpu_callback,
178}; 178};
179 179
180static char *cpuid_devnode(struct device *dev, mode_t *mode) 180static char *cpuid_devnode(struct device *dev, umode_t *mode)
181{ 181{
182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
183} 183}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 3b97a80ce329..c99f9ed013d5 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs)
116 for (i = 0; i < code_len; i++, ip++) { 116 for (i = 0; i < code_len; i++, ip++) {
117 if (ip < (u8 *)PAGE_OFFSET || 117 if (ip < (u8 *)PAGE_OFFSET ||
118 probe_kernel_address(ip, c)) { 118 probe_kernel_address(ip, c)) {
119 printk(" Bad EIP value."); 119 printk(KERN_CONT " Bad EIP value.");
120 break; 120 break;
121 } 121 }
122 if (ip == (u8 *)regs->ip) 122 if (ip == (u8 *)regs->ip)
123 printk("<%02x> ", c); 123 printk(KERN_CONT "<%02x> ", c);
124 else 124 else
125 printk("%02x ", c); 125 printk(KERN_CONT "%02x ", c);
126 } 126 }
127 } 127 }
128 printk("\n"); 128 printk(KERN_CONT "\n");
129} 129}
130 130
131int is_valid_bugaddr(unsigned long ip) 131int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 19853ad8afc5..6d728d9284bd 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs)
284 for (i = 0; i < code_len; i++, ip++) { 284 for (i = 0; i < code_len; i++, ip++) {
285 if (ip < (u8 *)PAGE_OFFSET || 285 if (ip < (u8 *)PAGE_OFFSET ||
286 probe_kernel_address(ip, c)) { 286 probe_kernel_address(ip, c)) {
287 printk(" Bad RIP value."); 287 printk(KERN_CONT " Bad RIP value.");
288 break; 288 break;
289 } 289 }
290 if (ip == (u8 *)regs->ip) 290 if (ip == (u8 *)regs->ip)
291 printk("<%02x> ", c); 291 printk(KERN_CONT "<%02x> ", c);
292 else 292 else
293 printk("%02x ", c); 293 printk(KERN_CONT "%02x ", c);
294 } 294 }
295 } 295 }
296 printk("\n"); 296 printk(KERN_CONT "\n");
297} 297}
298 298
299int is_valid_bugaddr(unsigned long ip) 299int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 056e65d5012b..62d61e9976eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
19#include <linux/acpi.h> 19#include <linux/acpi.h>
20#include <linux/firmware-map.h> 20#include <linux/firmware-map.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22#include <linux/sort.h>
22 23
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/proto.h> 25#include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
227 * ____________________33__ 228 * ____________________33__
228 * ______________________4_ 229 * ______________________4_
229 */ 230 */
231struct change_member {
232 struct e820entry *pbios; /* pointer to original bios entry */
233 unsigned long long addr; /* address for this change point */
234};
235
236static int __init cpcompare(const void *a, const void *b)
237{
238 struct change_member * const *app = a, * const *bpp = b;
239 const struct change_member *ap = *app, *bp = *bpp;
240
241 /*
242 * Inputs are pointers to two elements of change_point[]. If their
243 * addresses are unequal, their difference dominates. If the addresses
244 * are equal, then consider one that represents the end of its region
245 * to be greater than one that does not.
246 */
247 if (ap->addr != bp->addr)
248 return ap->addr > bp->addr ? 1 : -1;
249
250 return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
251}
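A hedged, self-contained illustration of the tie-break (simplified types and invented names, not part of the patch): at an equal address, an end-of-region change point compares greater than a start-of-region point, so region starts are processed first.

    #include <stdio.h>

    struct entry  { unsigned long long addr, size; };
    struct change { struct entry *pbios; unsigned long long addr; };

    /* the same ordering rule as cpcompare(), on the simplified types */
    static int cp_cmp(const struct change *a, const struct change *b)
    {
            if (a->addr != b->addr)
                    return a->addr > b->addr ? 1 : -1;
            return (a->addr != a->pbios->addr) - (b->addr != b->pbios->addr);
    }

    int main(void)
    {
            struct entry  e1 = { 0x1000, 0x1000 };          /* [0x1000, 0x2000) */
            struct entry  e2 = { 0x2000, 0x1000 };          /* [0x2000, 0x3000) */
            struct change end_e1   = { &e1, 0x2000 };       /* end of e1   */
            struct change start_e2 = { &e2, 0x2000 };       /* start of e2 */

            printf("%d\n", cp_cmp(&end_e1, &start_e2));     /* prints 1 */
            return 0;
    }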
230 252
231int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 253int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
232 u32 *pnr_map) 254 u32 *pnr_map)
233{ 255{
234 struct change_member {
235 struct e820entry *pbios; /* pointer to original bios entry */
236 unsigned long long addr; /* address for this change point */
237 };
238 static struct change_member change_point_list[2*E820_X_MAX] __initdata; 256 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
239 static struct change_member *change_point[2*E820_X_MAX] __initdata; 257 static struct change_member *change_point[2*E820_X_MAX] __initdata;
240 static struct e820entry *overlap_list[E820_X_MAX] __initdata; 258 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
241 static struct e820entry new_bios[E820_X_MAX] __initdata; 259 static struct e820entry new_bios[E820_X_MAX] __initdata;
242 struct change_member *change_tmp;
243 unsigned long current_type, last_type; 260 unsigned long current_type, last_type;
244 unsigned long long last_addr; 261 unsigned long long last_addr;
245 int chgidx, still_changing; 262 int chgidx;
246 int overlap_entries; 263 int overlap_entries;
247 int new_bios_entry; 264 int new_bios_entry;
248 int old_nr, new_nr, chg_nr; 265 int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
279 chg_nr = chgidx; 296 chg_nr = chgidx;
280 297
281 /* sort change-point list by memory addresses (low -> high) */ 298 /* sort change-point list by memory addresses (low -> high) */
282 still_changing = 1; 299 sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
283 while (still_changing) {
284 still_changing = 0;
285 for (i = 1; i < chg_nr; i++) {
286 unsigned long long curaddr, lastaddr;
287 unsigned long long curpbaddr, lastpbaddr;
288
289 curaddr = change_point[i]->addr;
290 lastaddr = change_point[i - 1]->addr;
291 curpbaddr = change_point[i]->pbios->addr;
292 lastpbaddr = change_point[i - 1]->pbios->addr;
293
294 /*
295 * swap entries, when:
296 *
297 * curaddr > lastaddr or
298 * curaddr == lastaddr and curaddr == curpbaddr and
299 * lastaddr != lastpbaddr
300 */
301 if (curaddr < lastaddr ||
302 (curaddr == lastaddr && curaddr == curpbaddr &&
303 lastaddr != lastpbaddr)) {
304 change_tmp = change_point[i];
305 change_point[i] = change_point[i-1];
306 change_point[i-1] = change_tmp;
307 still_changing = 1;
308 }
309 }
310 }
311 300
312 /* create a new bios memory map, removing overlaps */ 301 /* create a new bios memory map, removing overlaps */
313 overlap_entries = 0; /* number of entries in the overlap table */ 302 overlap_entries = 0; /* number of entries in the overlap table */
@@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
714} 703}
715#endif 704#endif
716 705
717#ifdef CONFIG_HIBERNATION 706#ifdef CONFIG_ACPI
718/** 707/**
719 * Mark ACPI NVS memory region, so that we can save/restore it during 708 * Mark ACPI NVS memory region, so that we can save/restore it during
720 * hibernation and the subsequent resume. 709 * hibernation and the subsequent resume.
@@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void)
727 struct e820entry *ei = &e820.map[i]; 716 struct e820entry *ei = &e820.map[i];
728 717
729 if (ei->type == E820_NVS) 718 if (ei->type == E820_NVS)
730 suspend_nvs_register(ei->addr, ei->size); 719 acpi_nvs_register(ei->addr, ei->size);
731 } 720 }
732 721
733 return 0; 722 return 0;
@@ -1072,7 +1061,7 @@ void __init memblock_x86_fill(void)
1072 * We are safe to enable resizing, because memblock_x86_fill() 1061 * We are safe to enable resizing, because memblock_x86_fill()
1073 * is rather late for x86 1062 * is rather late for x86
1074 */ 1063 */
1075 memblock_can_resize = 1; 1064 memblock_allow_resize();
1076 1065
1077 for (i = 0; i < e820.nr_map; i++) { 1066 for (i = 0; i < e820.nr_map; i++) {
1078 struct e820entry *ei = &e820.map[i]; 1067 struct e820entry *ei = &e820.map[i];
@@ -1087,7 +1076,6 @@ void __init memblock_x86_fill(void)
1087 memblock_add(ei->addr, ei->size); 1076 memblock_add(ei->addr, ei->size);
1088 } 1077 }
1089 1078
1090 memblock_analyze();
1091 memblock_dump_all(); 1079 memblock_dump_all();
1092} 1080}
1093 1081
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9b9f18b49918 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
240 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
241 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
242#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST 243#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
244 if (!strncmp(buf, "mrst", 4)) { 244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init(); 245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep); 246 early_console_register(&early_mrst_console, keep);
247 } 247 }
248 248
249 if (!strncmp(buf, "hsu", 3)) { 249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init(); 250 hsu_early_console_init(buf + 3);
251 early_console_register(&early_hsu_console, keep); 251 early_console_register(&early_hsu_console, keep);
252 } 252 }
253#endif 253#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..79d97e68f042 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <linux/linkage.h> 44#include <linux/linkage.h>
45#include <linux/err.h>
45#include <asm/thread_info.h> 46#include <asm/thread_info.h>
46#include <asm/irqflags.h> 47#include <asm/irqflags.h>
47#include <asm/errno.h> 48#include <asm/errno.h>
@@ -81,8 +82,6 @@
81 * enough to patch inline, increasing performance. 82 * enough to patch inline, increasing performance.
82 */ 83 */
83 84
84#define nr_syscalls ((syscall_table_size)/4)
85
86#ifdef CONFIG_PREEMPT 85#ifdef CONFIG_PREEMPT
87#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 86#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 87#else
@@ -423,7 +422,7 @@ sysenter_past_esp:
423 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 422 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
424 jnz sysenter_audit 423 jnz sysenter_audit
425sysenter_do_call: 424sysenter_do_call:
426 cmpl $(nr_syscalls), %eax 425 cmpl $(NR_syscalls), %eax
427 jae syscall_badsys 426 jae syscall_badsys
428 call *sys_call_table(,%eax,4) 427 call *sys_call_table(,%eax,4)
429 movl %eax,PT_EAX(%esp) 428 movl %eax,PT_EAX(%esp)
@@ -455,7 +454,7 @@ sysenter_audit:
455 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ 454 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
456 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
457 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
458 call audit_syscall_entry 457 call __audit_syscall_entry
459 pushl_cfi %ebx 458 pushl_cfi %ebx
460 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
461 jmp sysenter_do_call 460 jmp sysenter_do_call
@@ -466,11 +465,10 @@ sysexit_audit:
466 TRACE_IRQS_ON 465 TRACE_IRQS_ON
467 ENABLE_INTERRUPTS(CLBR_ANY) 466 ENABLE_INTERRUPTS(CLBR_ANY)
468 movl %eax,%edx /* second arg, syscall return value */ 467 movl %eax,%edx /* second arg, syscall return value */
469 cmpl $0,%eax /* is it < 0? */ 468 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
470 setl %al /* 1 if so, 0 if not */ 469 setbe %al /* 1 if so, 0 if not */
471 movzbl %al,%eax /* zero-extend that */ 470 movzbl %al,%eax /* zero-extend that */
472 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 471 call __audit_syscall_exit
473 call audit_syscall_exit
474 DISABLE_INTERRUPTS(CLBR_ANY) 472 DISABLE_INTERRUPTS(CLBR_ANY)
475 TRACE_IRQS_OFF 473 TRACE_IRQS_OFF
476 movl TI_flags(%ebp), %ecx 474 movl TI_flags(%ebp), %ecx
@@ -504,7 +502,7 @@ ENTRY(system_call)
504 # system call tracing in operation / emulation 502 # system call tracing in operation / emulation
505 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 503 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
506 jnz syscall_trace_entry 504 jnz syscall_trace_entry
507 cmpl $(nr_syscalls), %eax 505 cmpl $(NR_syscalls), %eax
508 jae syscall_badsys 506 jae syscall_badsys
509syscall_call: 507syscall_call:
510 call *sys_call_table(,%eax,4) 508 call *sys_call_table(,%eax,4)
@@ -625,6 +623,8 @@ work_notifysig: # deal with pending signals and
625 movl %esp, %eax 623 movl %esp, %eax
626 jne work_notifysig_v86 # returning to kernel-space or 624 jne work_notifysig_v86 # returning to kernel-space or
627 # vm86-space 625 # vm86-space
626 TRACE_IRQS_ON
627 ENABLE_INTERRUPTS(CLBR_NONE)
628 xorl %edx, %edx 628 xorl %edx, %edx
629 call do_notify_resume 629 call do_notify_resume
630 jmp resume_userspace_sig 630 jmp resume_userspace_sig
@@ -638,6 +638,8 @@ work_notifysig_v86:
638#else 638#else
639 movl %esp, %eax 639 movl %esp, %eax
640#endif 640#endif
641 TRACE_IRQS_ON
642 ENABLE_INTERRUPTS(CLBR_NONE)
641 xorl %edx, %edx 643 xorl %edx, %edx
642 call do_notify_resume 644 call do_notify_resume
643 jmp resume_userspace_sig 645 jmp resume_userspace_sig
@@ -650,7 +652,7 @@ syscall_trace_entry:
650 movl %esp, %eax 652 movl %esp, %eax
651 call syscall_trace_enter 653 call syscall_trace_enter
652 /* What it returned is what we'll actually use. */ 654 /* What it returned is what we'll actually use. */
653 cmpl $(nr_syscalls), %eax 655 cmpl $(NR_syscalls), %eax
654 jnae syscall_call 656 jnae syscall_call
655 jmp syscall_exit 657 jmp syscall_exit
656END(syscall_trace_entry) 658END(syscall_trace_entry)
@@ -690,29 +692,28 @@ END(syscall_badsys)
690 * System calls that need a pt_regs pointer. 692 * System calls that need a pt_regs pointer.
691 */ 693 */
692#define PTREGSCALL0(name) \ 694#define PTREGSCALL0(name) \
693 ALIGN; \ 695ENTRY(ptregs_##name) ; \
694ptregs_##name: \
695 leal 4(%esp),%eax; \ 696 leal 4(%esp),%eax; \
696 jmp sys_##name; 697 jmp sys_##name; \
698ENDPROC(ptregs_##name)
697 699
698#define PTREGSCALL1(name) \ 700#define PTREGSCALL1(name) \
699 ALIGN; \ 701ENTRY(ptregs_##name) ; \
700ptregs_##name: \
701 leal 4(%esp),%edx; \ 702 leal 4(%esp),%edx; \
702 movl (PT_EBX+4)(%esp),%eax; \ 703 movl (PT_EBX+4)(%esp),%eax; \
703 jmp sys_##name; 704 jmp sys_##name; \
705ENDPROC(ptregs_##name)
704 706
705#define PTREGSCALL2(name) \ 707#define PTREGSCALL2(name) \
706 ALIGN; \ 708ENTRY(ptregs_##name) ; \
707ptregs_##name: \
708 leal 4(%esp),%ecx; \ 709 leal 4(%esp),%ecx; \
709 movl (PT_ECX+4)(%esp),%edx; \ 710 movl (PT_ECX+4)(%esp),%edx; \
710 movl (PT_EBX+4)(%esp),%eax; \ 711 movl (PT_EBX+4)(%esp),%eax; \
711 jmp sys_##name; 712 jmp sys_##name; \
713ENDPROC(ptregs_##name)
712 714
713#define PTREGSCALL3(name) \ 715#define PTREGSCALL3(name) \
714 ALIGN; \ 716ENTRY(ptregs_##name) ; \
715ptregs_##name: \
716 CFI_STARTPROC; \ 717 CFI_STARTPROC; \
717 leal 4(%esp),%eax; \ 718 leal 4(%esp),%eax; \
718 pushl_cfi %eax; \ 719 pushl_cfi %eax; \
@@ -737,8 +738,7 @@ PTREGSCALL2(vm86)
737PTREGSCALL1(vm86old) 738PTREGSCALL1(vm86old)
738 739
739/* Clone is an oddball. The 4th arg is in %edi */ 740/* Clone is an oddball. The 4th arg is in %edi */
740 ALIGN; 741ENTRY(ptregs_clone)
741ptregs_clone:
742 CFI_STARTPROC 742 CFI_STARTPROC
743 leal 4(%esp),%eax 743 leal 4(%esp),%eax
744 pushl_cfi %eax 744 pushl_cfi %eax
@@ -1209,11 +1209,6 @@ return_to_handler:
1209 jmp *%ecx 1209 jmp *%ecx
1210#endif 1210#endif
1211 1211
1212.section .rodata,"a"
1213#include "syscall_table_32.S"
1214
1215syscall_table_size=(.-sys_call_table)
1216
1217/* 1212/*
1218 * Some functions should be protected against kprobes 1213 * Some functions should be protected against kprobes
1219 */ 1214 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..3fe8239fd8fb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
55#include <asm/paravirt.h> 55#include <asm/paravirt.h>
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <linux/err.h>
58 59
59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 60/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
60#include <linux/elf-em.h> 61#include <linux/elf-em.h>
@@ -221,7 +222,7 @@ ENDPROC(native_usergs_sysret64)
221 /*CFI_REL_OFFSET ss,0*/ 222 /*CFI_REL_OFFSET ss,0*/
222 pushq_cfi %rax /* rsp */ 223 pushq_cfi %rax /* rsp */
223 CFI_REL_OFFSET rsp,0 224 CFI_REL_OFFSET rsp,0
224 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ 225 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
225 /*CFI_REL_OFFSET rflags,0*/ 226 /*CFI_REL_OFFSET rflags,0*/
226 pushq_cfi $__KERNEL_CS /* cs */ 227 pushq_cfi $__KERNEL_CS /* cs */
227 /*CFI_REL_OFFSET cs,0*/ 228 /*CFI_REL_OFFSET cs,0*/
@@ -411,7 +412,7 @@ ENTRY(ret_from_fork)
411 RESTORE_REST 412 RESTORE_REST
412 413
413 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 414 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
414 je int_ret_from_sys_call 415 jz retint_restore_args
415 416
416 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 417 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
417 jnz int_ret_from_sys_call 418 jnz int_ret_from_sys_call
@@ -465,7 +466,7 @@ ENTRY(system_call)
465 * after the swapgs, so that it can do the swapgs 466 * after the swapgs, so that it can do the swapgs
466 * for the guest and jump here on syscall. 467 * for the guest and jump here on syscall.
467 */ 468 */
468ENTRY(system_call_after_swapgs) 469GLOBAL(system_call_after_swapgs)
469 470
470 movq %rsp,PER_CPU_VAR(old_rsp) 471 movq %rsp,PER_CPU_VAR(old_rsp)
471 movq PER_CPU_VAR(kernel_stack),%rsp 472 movq PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +479,7 @@ ENTRY(system_call_after_swapgs)
478 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 479 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
479 movq %rcx,RIP-ARGOFFSET(%rsp) 480 movq %rcx,RIP-ARGOFFSET(%rsp)
480 CFI_REL_OFFSET rip,RIP-ARGOFFSET 481 CFI_REL_OFFSET rip,RIP-ARGOFFSET
481 GET_THREAD_INFO(%rcx) 482 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
482 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
483 jnz tracesys 483 jnz tracesys
484system_call_fastpath: 484system_call_fastpath:
485 cmpq $__NR_syscall_max,%rax 485 cmpq $__NR_syscall_max,%rax
@@ -496,10 +496,9 @@ ret_from_sys_call:
496 /* edi: flagmask */ 496 /* edi: flagmask */
497sysret_check: 497sysret_check:
498 LOCKDEP_SYS_EXIT 498 LOCKDEP_SYS_EXIT
499 GET_THREAD_INFO(%rcx)
500 DISABLE_INTERRUPTS(CLBR_NONE) 499 DISABLE_INTERRUPTS(CLBR_NONE)
501 TRACE_IRQS_OFF 500 TRACE_IRQS_OFF
502 movl TI_flags(%rcx),%edx 501 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
503 andl %edi,%edx 502 andl %edi,%edx
504 jnz sysret_careful 503 jnz sysret_careful
505 CFI_REMEMBER_STATE 504 CFI_REMEMBER_STATE
@@ -550,7 +549,7 @@ badsys:
550#ifdef CONFIG_AUDITSYSCALL 549#ifdef CONFIG_AUDITSYSCALL
551 /* 550 /*
552 * Fast path for syscall audit without full syscall trace. 551 * Fast path for syscall audit without full syscall trace.
553 * We just call audit_syscall_entry() directly, and then 552 * We just call __audit_syscall_entry() directly, and then
554 * jump back to the normal fast path. 553 * jump back to the normal fast path.
555 */ 554 */
556auditsys: 555auditsys:
@@ -560,22 +559,21 @@ auditsys:
560 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ 559 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
561 movq %rax,%rsi /* 2nd arg: syscall number */ 560 movq %rax,%rsi /* 2nd arg: syscall number */
562 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ 561 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
563 call audit_syscall_entry 562 call __audit_syscall_entry
564 LOAD_ARGS 0 /* reload call-clobbered registers */ 563 LOAD_ARGS 0 /* reload call-clobbered registers */
565 jmp system_call_fastpath 564 jmp system_call_fastpath
566 565
567 /* 566 /*
568 * Return fast path for syscall audit. Call audit_syscall_exit() 567 * Return fast path for syscall audit. Call __audit_syscall_exit()
569 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT 568 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
570 * masked off. 569 * masked off.
571 */ 570 */
572sysret_audit: 571sysret_audit:
573 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ 572 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
574 cmpq $0,%rsi /* is it < 0? */ 573 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
575 setl %al /* 1 if so, 0 if not */ 574 setbe %al /* 1 if so, 0 if not */
576 movzbl %al,%edi /* zero-extend that into %edi */ 575 movzbl %al,%edi /* zero-extend that into %edi */
577 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 576 call __audit_syscall_exit
578 call audit_syscall_exit
579 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 577 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
580 jmp sysret_check 578 jmp sysret_check
581#endif /* CONFIG_AUDITSYSCALL */ 579#endif /* CONFIG_AUDITSYSCALL */
@@ -583,7 +581,7 @@ sysret_audit:
583 /* Do syscall tracing */ 581 /* Do syscall tracing */
584tracesys: 582tracesys:
585#ifdef CONFIG_AUDITSYSCALL 583#ifdef CONFIG_AUDITSYSCALL
586 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 584 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
587 jz auditsys 585 jz auditsys
588#endif 586#endif
589 SAVE_REST 587 SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
612GLOBAL(int_ret_from_sys_call) 610GLOBAL(int_ret_from_sys_call)
613 DISABLE_INTERRUPTS(CLBR_NONE) 611 DISABLE_INTERRUPTS(CLBR_NONE)
614 TRACE_IRQS_OFF 612 TRACE_IRQS_OFF
615 testl $3,CS-ARGOFFSET(%rsp)
616 je retint_restore_args
617 movl $_TIF_ALLWORK_MASK,%edi 613 movl $_TIF_ALLWORK_MASK,%edi
618 /* edi: mask to check */ 614 /* edi: mask to check */
619GLOBAL(int_with_check) 615GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
953ENTRY(\sym) 949ENTRY(\sym)
954 INTR_FRAME 950 INTR_FRAME
955 pushq_cfi $~(\num) 951 pushq_cfi $~(\num)
952.Lcommon_\sym:
956 interrupt \do_sym 953 interrupt \do_sym
957 jmp ret_from_intr 954 jmp ret_from_intr
958 CFI_ENDPROC 955 CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
976 x86_platform_ipi smp_x86_platform_ipi 973 x86_platform_ipi smp_x86_platform_ipi
977 974
978#ifdef CONFIG_SMP 975#ifdef CONFIG_SMP
979.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 976 ALIGN
977 INTR_FRAME
978.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
980 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 979 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
981.if NUM_INVALIDATE_TLB_VECTORS > \idx 980.if NUM_INVALIDATE_TLB_VECTORS > \idx
982apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 981ENTRY(invalidate_interrupt\idx)
983 invalidate_interrupt\idx smp_invalidate_interrupt 982 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
983 jmp .Lcommon_invalidate_interrupt0
984 CFI_ADJUST_CFA_OFFSET -8
985END(invalidate_interrupt\idx)
984.endif 986.endif
985.endr 987.endr
988 CFI_ENDPROC
989apicinterrupt INVALIDATE_TLB_VECTOR_START, \
990 invalidate_interrupt0, smp_invalidate_interrupt
986#endif 991#endif
987 992
988apicinterrupt THRESHOLD_APIC_VECTOR \ 993apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1475,62 +1480,214 @@ ENTRY(error_exit)
1475 CFI_ENDPROC 1480 CFI_ENDPROC
1476END(error_exit) 1481END(error_exit)
1477 1482
1483/*
1484 * Test if a given stack is an NMI stack or not.
1485 */
1486 .macro test_in_nmi reg stack nmi_ret normal_ret
1487 cmpq %\reg, \stack
1488 ja \normal_ret
1489 subq $EXCEPTION_STKSZ, %\reg
1490 cmpq %\reg, \stack
1491 jb \normal_ret
1492 jmp \nmi_ret
1493 .endm
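A hedged C rendering of the same check (names invented; the macro above is the authoritative form): the interrupted stack pointer counts as being on the NMI stack when it falls inside the EXCEPTION_STKSZ-sized window ending at the value passed in \reg.

    static int test_in_nmi_sketch(unsigned long sp, unsigned long stack_top,
                                  unsigned long exception_stksz)
    {
            return sp <= stack_top && sp >= stack_top - exception_stksz;
    }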
1478 1494
1479 /* runs on exception stack */ 1495 /* runs on exception stack */
1480ENTRY(nmi) 1496ENTRY(nmi)
1481 INTR_FRAME 1497 INTR_FRAME
1482 PARAVIRT_ADJUST_EXCEPTION_FRAME 1498 PARAVIRT_ADJUST_EXCEPTION_FRAME
1483 pushq_cfi $-1 1499 /*
1500 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1501 * the iretq it performs will take us out of NMI context.
1502 * This means that we can have nested NMIs where the next
1503 * NMI is using the top of the stack of the previous NMI. We
1504 * can't let it execute because the nested NMI will corrupt the
1505 * stack of the previous NMI. NMI handlers are not re-entrant
1506 * anyway.
1507 *
1508 * To handle this case we do the following:
 1509 * Check a special location on the stack that contains
1510 * a variable that is set when NMIs are executing.
1511 * The interrupted task's stack is also checked to see if it
1512 * is an NMI stack.
1513 * If the variable is not set and the stack is not the NMI
1514 * stack then:
1515 * o Set the special variable on the stack
1516 * o Copy the interrupt frame into a "saved" location on the stack
1517 * o Copy the interrupt frame into a "copy" location on the stack
1518 * o Continue processing the NMI
1519 * If the variable is set or the previous stack is the NMI stack:
 1520 * o Modify the "copy" location to jump to repeat_nmi
1521 * o return back to the first NMI
1522 *
 1523 * Now on exit of the first NMI, we first clear the stack variable.
1524 * The NMI stack will tell any nested NMIs at that point that it is
1525 * nested. Then we pop the stack normally with iret, and if there was
1526 * a nested NMI that updated the copy interrupt stack frame, a
1527 * jump will be made to the repeat_nmi code that will handle the second
1528 * NMI.
1529 */
1530
 1531 /* Use %rdx as our temp variable throughout */
1532 pushq_cfi %rdx
1533
1534 /*
1535 * Check the special variable on the stack to see if NMIs are
1536 * executing.
1537 */
1538 cmp $1, -8(%rsp)
1539 je nested_nmi
1540
1541 /*
1542 * Now test if the previous stack was an NMI stack.
 1543 * We need the double check. We check the NMI stack to cover the
 1544 * race where the first NMI clears the variable before returning.
1545 * We check the variable because the first NMI could be in a
1546 * breakpoint routine using a breakpoint stack.
1547 */
1548 lea 6*8(%rsp), %rdx
1549 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1550
1551nested_nmi:
1552 /*
1553 * Do nothing if we interrupted the fixup in repeat_nmi.
1554 * It's about to repeat the NMI handler, so we are fine
1555 * with ignoring this one.
1556 */
1557 movq $repeat_nmi, %rdx
1558 cmpq 8(%rsp), %rdx
1559 ja 1f
1560 movq $end_repeat_nmi, %rdx
1561 cmpq 8(%rsp), %rdx
1562 ja nested_nmi_out
1563
15641:
 1565 /* Set up the interrupted NMI's stack to jump to repeat_nmi */
1566 leaq -6*8(%rsp), %rdx
1567 movq %rdx, %rsp
1568 CFI_ADJUST_CFA_OFFSET 6*8
1569 pushq_cfi $__KERNEL_DS
1570 pushq_cfi %rdx
1571 pushfq_cfi
1572 pushq_cfi $__KERNEL_CS
1573 pushq_cfi $repeat_nmi
1574
1575 /* Put stack back */
1576 addq $(11*8), %rsp
1577 CFI_ADJUST_CFA_OFFSET -11*8
1578
1579nested_nmi_out:
1580 popq_cfi %rdx
1581
1582 /* No need to check faults here */
1583 INTERRUPT_RETURN
1584
1585first_nmi:
1586 /*
1587 * Because nested NMIs will use the pushed location that we
1588 * stored in rdx, we must keep that space available.
1589 * Here's what our stack frame will look like:
1590 * +-------------------------+
1591 * | original SS |
1592 * | original Return RSP |
1593 * | original RFLAGS |
1594 * | original CS |
1595 * | original RIP |
1596 * +-------------------------+
1597 * | temp storage for rdx |
1598 * +-------------------------+
1599 * | NMI executing variable |
1600 * +-------------------------+
1601 * | Saved SS |
1602 * | Saved Return RSP |
1603 * | Saved RFLAGS |
1604 * | Saved CS |
1605 * | Saved RIP |
1606 * +-------------------------+
1607 * | copied SS |
1608 * | copied Return RSP |
1609 * | copied RFLAGS |
1610 * | copied CS |
1611 * | copied RIP |
1612 * +-------------------------+
1613 * | pt_regs |
1614 * +-------------------------+
1615 *
1616 * The saved RIP is used to fix up the copied RIP that a nested
1617 * NMI may zero out. The original stack frame and the temp storage
 1618 * are also used by nested NMIs and cannot be trusted on exit.
1619 */
1620 /* Set the NMI executing variable on the stack. */
1621 pushq_cfi $1
1622
1623 /* Copy the stack frame to the Saved frame */
1624 .rept 5
1625 pushq_cfi 6*8(%rsp)
1626 .endr
1627
1628 /* Make another copy, this one may be modified by nested NMIs */
1629 .rept 5
1630 pushq_cfi 4*8(%rsp)
1631 .endr
1632
1633 /* Do not pop rdx, nested NMIs will corrupt it */
1634 movq 11*8(%rsp), %rdx
1635
1636 /*
1637 * Everything below this point can be preempted by a nested
1638 * NMI if the first NMI took an exception. Repeated NMIs
 1639 * caused by an exception and a nested NMI will start here, and
1640 * can still be preempted by another NMI.
1641 */
1642restart_nmi:
1643 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1484 subq $ORIG_RAX-R15, %rsp 1644 subq $ORIG_RAX-R15, %rsp
1485 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1645 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1646 /*
1647 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
 1648 * as we should not be calling schedule in NMI context,
 1649 * even with normal interrupts enabled. An NMI should not be
1650 * setting NEED_RESCHED or anything that normal interrupts and
1651 * exceptions might do.
1652 */
1486 call save_paranoid 1653 call save_paranoid
1487 DEFAULT_FRAME 0 1654 DEFAULT_FRAME 0
1488 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1655 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1489 movq %rsp,%rdi 1656 movq %rsp,%rdi
1490 movq $-1,%rsi 1657 movq $-1,%rsi
1491 call do_nmi 1658 call do_nmi
1492#ifdef CONFIG_TRACE_IRQFLAGS
1493 /* paranoidexit; without TRACE_IRQS_OFF */
1494 /* ebx: no swapgs flag */
1495 DISABLE_INTERRUPTS(CLBR_NONE)
1496 testl %ebx,%ebx /* swapgs needed? */ 1659 testl %ebx,%ebx /* swapgs needed? */
1497 jnz nmi_restore 1660 jnz nmi_restore
1498 testl $3,CS(%rsp)
1499 jnz nmi_userspace
1500nmi_swapgs: 1661nmi_swapgs:
1501 SWAPGS_UNSAFE_STACK 1662 SWAPGS_UNSAFE_STACK
1502nmi_restore: 1663nmi_restore:
1503 RESTORE_ALL 8 1664 RESTORE_ALL 8
1665 /* Clear the NMI executing stack variable */
1666 movq $0, 10*8(%rsp)
1504 jmp irq_return 1667 jmp irq_return
1505nmi_userspace:
1506 GET_THREAD_INFO(%rcx)
1507 movl TI_flags(%rcx),%ebx
1508 andl $_TIF_WORK_MASK,%ebx
1509 jz nmi_swapgs
1510 movq %rsp,%rdi /* &pt_regs */
1511 call sync_regs
1512 movq %rax,%rsp /* switch stack for scheduling */
1513 testl $_TIF_NEED_RESCHED,%ebx
1514 jnz nmi_schedule
1515 movl %ebx,%edx /* arg3: thread flags */
1516 ENABLE_INTERRUPTS(CLBR_NONE)
1517 xorl %esi,%esi /* arg2: oldset */
1518 movq %rsp,%rdi /* arg1: &pt_regs */
1519 call do_notify_resume
1520 DISABLE_INTERRUPTS(CLBR_NONE)
1521 jmp nmi_userspace
1522nmi_schedule:
1523 ENABLE_INTERRUPTS(CLBR_ANY)
1524 call schedule
1525 DISABLE_INTERRUPTS(CLBR_ANY)
1526 jmp nmi_userspace
1527 CFI_ENDPROC 1668 CFI_ENDPROC
1528#else
1529 jmp paranoid_exit
1530 CFI_ENDPROC
1531#endif
1532END(nmi) 1669END(nmi)
1533 1670
1671 /*
1672 * If an NMI hit an iret because of an exception or breakpoint,
1673 * it can lose its NMI context, and a nested NMI may come in.
1674 * In that case, the nested NMI will change the preempted NMI's
1675 * stack to jump to here when it does the final iret.
1676 */
1677repeat_nmi:
1678 INTR_FRAME
1679 /* Update the stack variable to say we are still in NMI */
1680 movq $1, 5*8(%rsp)
1681
1682 /* copy the saved stack back to copy stack */
1683 .rept 5
1684 pushq_cfi 4*8(%rsp)
1685 .endr
1686
1687 jmp restart_nmi
1688 CFI_ENDPROC
1689end_repeat_nmi:
1690
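To summarize the control flow documented in the comments above, a hedged, illustration-only C model (all names invented; the real mechanism is the stack-variable and frame-copy handling in the assembly): a nested NMI never runs the handler itself, it only asks the first NMI to repeat it.

    #include <stdbool.h>

    struct nmi_state {
            bool executing;         /* the "NMI executing" stack variable  */
            bool repeat_requested;  /* a nested NMI patched the copy frame */
    };

    static void nmi_model(struct nmi_state *s, void (*handler)(void))
    {
            if (s->executing) {     /* nested: latch a re-run and leave */
                    s->repeat_requested = true;
                    return;
            }

            s->executing = true;
            do {
                    s->repeat_requested = false;
                    handler();      /* may be interrupted by another NMI */
            } while (s->repeat_requested);
            s->executing = false;
    }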
1534ENTRY(ignore_sysret) 1691ENTRY(ignore_sysret)
1535 CFI_STARTPROC 1692 CFI_STARTPROC
1536 mov $-ENOSYS,%eax 1693 mov $-ENOSYS,%eax
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index be9282bcda72..51ff18616d50 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,8 +31,6 @@ static void __init i386_default_early_setup(void)
31 31
32void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
33{ 33{
34 memblock_init();
35
36 memblock_reserve(__pa_symbol(&_text), 34 memblock_reserve(__pa_symbol(&_text),
37 __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); 35 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
38 36
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index fd25b11549b8..3a3b779f41d3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 memblock_init();
102
103 memblock_reserve(__pa_symbol(&_text), 101 memblock_reserve(__pa_symbol(&_text),
104 __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); 102 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
105 103
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
417ENTRY(idt_table) 417ENTRY(idt_table)
418 .skip IDT_ENTRIES * 16 418 .skip IDT_ENTRIES * 16
419 419
420 .align L1_CACHE_BYTES
421ENTRY(nmi_idt_table)
422 .skip IDT_ENTRIES * 16
423
420 __PAGE_ALIGNED_BSS 424 __PAGE_ALIGNED_BSS
421 .align PAGE_SIZE 425 .align PAGE_SIZE
422ENTRY(empty_zero_page) 426ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b946a9eac7d9..ad0de0c2714e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h> 3#include <linux/interrupt.h>
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/sysdev.h>
6#include <linux/delay.h> 5#include <linux/delay.h>
7#include <linux/errno.h> 6#include <linux/errno.h>
8#include <linux/i8253.h> 7#include <linux/i8253.h>
@@ -32,8 +31,6 @@
32#define HPET_MIN_CYCLES 128 31#define HPET_MIN_CYCLES 128
33#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) 32#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
34 33
35#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
36
37/* 34/*
38 * HPET address is set in acpi/boot.c, when an ACPI entry exists 35 * HPET address is set in acpi/boot.c, when an ACPI entry exists
39 */ 36 */
@@ -55,6 +52,11 @@ struct hpet_dev {
55 char name[10]; 52 char name[10];
56}; 53};
57 54
55inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
56{
57 return container_of(evtdev, struct hpet_dev, evt);
58}
59
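For readers new to the idiom, a hedged note (not from the patch): container_of() recovers the enclosing structure from a pointer to one of its members, so the helper above is roughly equivalent to the following (the macro name is invented):

    #include <stddef.h>

    #define HPET_DEV_FROM_EVT(ptr) \
            ((struct hpet_dev *)((char *)(ptr) - offsetof(struct hpet_dev, evt)))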
58inline unsigned int hpet_readl(unsigned int a) 60inline unsigned int hpet_readl(unsigned int a)
59{ 61{
60 return readl(hpet_virt_address + a); 62 return readl(hpet_virt_address + a);
@@ -1049,6 +1051,14 @@ int hpet_rtc_timer_init(void)
1049} 1051}
1050EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); 1052EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
1051 1053
1054static void hpet_disable_rtc_channel(void)
1055{
1056 unsigned long cfg;
1057 cfg = hpet_readl(HPET_T1_CFG);
1058 cfg &= ~HPET_TN_ENABLE;
1059 hpet_writel(cfg, HPET_T1_CFG);
1060}
1061
1052/* 1062/*
1053 * The functions below are called from rtc driver. 1063 * The functions below are called from rtc driver.
1054 * Return 0 if HPET is not being used. 1064 * Return 0 if HPET is not being used.
@@ -1060,6 +1070,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1060 return 0; 1070 return 0;
1061 1071
1062 hpet_rtc_flags &= ~bit_mask; 1072 hpet_rtc_flags &= ~bit_mask;
1073 if (unlikely(!hpet_rtc_flags))
1074 hpet_disable_rtc_channel();
1075
1063 return 1; 1076 return 1;
1064} 1077}
1065EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); 1078EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1125,15 +1138,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1125 1138
1126static void hpet_rtc_timer_reinit(void) 1139static void hpet_rtc_timer_reinit(void)
1127{ 1140{
1128 unsigned int cfg, delta; 1141 unsigned int delta;
1129 int lost_ints = -1; 1142 int lost_ints = -1;
1130 1143
1131 if (unlikely(!hpet_rtc_flags)) { 1144 if (unlikely(!hpet_rtc_flags))
1132 cfg = hpet_readl(HPET_T1_CFG); 1145 hpet_disable_rtc_channel();
1133 cfg &= ~HPET_TN_ENABLE;
1134 hpet_writel(cfg, HPET_T1_CFG);
1135 return;
1136 }
1137 1146
1138 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1147 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
1139 delta = hpet_default_delta; 1148 delta = hpet_default_delta;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..7943e0c21bde 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
74 for_each_online_cpu(j) 74 for_each_online_cpu(j)
75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); 75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
76 seq_printf(p, " IRQ work interrupts\n"); 76 seq_printf(p, " IRQ work interrupts\n");
77 seq_printf(p, "%*s: ", prec, "RTR");
78 for_each_online_cpu(j)
79 seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
80 seq_printf(p, " APIC ICR read retries\n");
77#endif 81#endif
78 if (x86_platform_ipi_callback) { 82 if (x86_platform_ipi_callback) {
79 seq_printf(p, "%*s: ", prec, "PLT"); 83 seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
136 sum += irq_stats(cpu)->irq_spurious_count; 140 sum += irq_stats(cpu)->irq_spurious_count;
137 sum += irq_stats(cpu)->apic_perf_irqs; 141 sum += irq_stats(cpu)->apic_perf_irqs;
138 sum += irq_stats(cpu)->apic_irq_work_irqs; 142 sum += irq_stats(cpu)->apic_irq_work_irqs;
143 sum += irq_stats(cpu)->icr_read_retry_count;
139#endif 144#endif
140 if (x86_platform_ipi_callback) 145 if (x86_platform_ipi_callback)
141 sum += irq_stats(cpu)->x86_platform_ipis; 146 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181 unsigned vector = ~regs->orig_ax; 186 unsigned vector = ~regs->orig_ax;
182 unsigned irq; 187 unsigned irq;
183 188
184 exit_idle();
185 irq_enter(); 189 irq_enter();
190 exit_idle();
186 191
187 irq = __this_cpu_read(vector_irq[vector]); 192 irq = __this_cpu_read(vector_irq[vector]);
188 193
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
209 214
210 ack_APIC_irq(); 215 ack_APIC_irq();
211 216
212 exit_idle();
213
214 irq_enter(); 217 irq_enter();
215 218
219 exit_idle();
220
216 inc_irq_stat(x86_platform_ipis); 221 inc_irq_stat(x86_platform_ipis);
217 222
218 if (x86_platform_ipi_callback) 223 if (x86_platform_ipi_callback)
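The irq.c hunks add a per-CPU icr_read_retry_count, print it as the "RTR" row of /proc/interrupts, and fold it into arch_irq_stat_cpu(). A small sketch of that bookkeeping pattern, bump a cheap per-CPU slot locally and sum only when reporting, with a plain array standing in for the kernel's per-CPU irq_stats (all names invented):

/*
 * Per-"CPU" counter that is incremented locally and aggregated on demand.
 */
#include <stdio.h>

#define NR_CPUS 4

static unsigned int icr_read_retries[NR_CPUS];   /* one slot per CPU */

static void count_retry(int cpu)
{
	icr_read_retries[cpu]++;                 /* cheap, no shared cacheline */
}

static unsigned long long total_retries(void)
{
	unsigned long long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += icr_read_retries[cpu];    /* folded in only when reporting */
	return sum;
}

int main(void)
{
	count_retry(0);
	count_retry(2);
	count_retry(2);
	printf("RTR: %llu APIC ICR read retries\n", total_retries());
	return 0;
}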
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs); 28EXPORT_PER_CPU_SYMBOL(irq_regs);
29 29
30#ifdef CONFIG_DEBUG_STACKOVERFLOW 30#ifdef CONFIG_DEBUG_STACKOVERFLOW
31
32int sysctl_panic_on_stackoverflow __read_mostly;
33
31/* Debugging check for stack overflow: is there less than 1KB free? */ 34/* Debugging check for stack overflow: is there less than 1KB free? */
32static int check_stack_overflow(void) 35static int check_stack_overflow(void)
33{ 36{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
43{ 46{
44 printk(KERN_WARNING "low stack detected by irq handler\n"); 47 printk(KERN_WARNING "low stack detected by irq handler\n");
45 dump_stack(); 48 dump_stack();
49 if (sysctl_panic_on_stackoverflow)
50 panic("low stack detected by irq handler - check messages\n");
46} 51}
47 52
48#else 53#else
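The irq_32.c change (mirrored for 64-bit below) introduces sysctl_panic_on_stackoverflow so the existing low-stack warning can be escalated into a panic. A trivial sketch of that escalation knob; only the variable name is taken from the patch, the reporting path is invented:

/*
 * Tunable that turns a warning into a fatal error.
 */
#include <stdio.h>
#include <stdlib.h>

static int sysctl_panic_on_stackoverflow;        /* 0 = warn, 1 = die */

static void report_low_stack(void)
{
	fprintf(stderr, "low stack detected by irq handler\n");
	if (sysctl_panic_on_stackoverflow)
		abort();                         /* stand-in for panic() */
}

int main(int argc, char **argv)
{
	sysctl_panic_on_stackoverflow = argc > 1;    /* pretend sysctl write */
	report_low_stack();
	return 0;
}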
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index acf8fbf8fbda..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
26DEFINE_PER_CPU(struct pt_regs *, irq_regs); 26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs); 27EXPORT_PER_CPU_SYMBOL(irq_regs);
28 28
29int sysctl_panic_on_stackoverflow;
30
29/* 31/*
30 * Probabilistic stack overflow check: 32 * Probabilistic stack overflow check:
31 * 33 *
@@ -36,15 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
36static inline void stack_overflow_check(struct pt_regs *regs) 38static inline void stack_overflow_check(struct pt_regs *regs)
37{ 39{
38#ifdef CONFIG_DEBUG_STACKOVERFLOW 40#ifdef CONFIG_DEBUG_STACKOVERFLOW
41#define STACK_TOP_MARGIN 128
42 struct orig_ist *oist;
43 u64 irq_stack_top, irq_stack_bottom;
44 u64 estack_top, estack_bottom;
39 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
40 46
41 WARN_ONCE(regs->sp >= curbase && 47 if (user_mode_vm(regs))
42 regs->sp <= curbase + THREAD_SIZE && 48 return;
43 regs->sp < curbase + sizeof(struct thread_info) + 49
44 sizeof(struct pt_regs) + 128, 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
51 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
52 regs->sp <= curbase + THREAD_SIZE)
53 return;
54
55 irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
56 STACK_TOP_MARGIN;
57 irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
58 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
59 return;
60
61 oist = &__get_cpu_var(orig_ist);
62 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
63 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
64 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
65 return;
66
67 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
68 current->comm, curbase, regs->sp,
69 irq_stack_top, irq_stack_bottom,
70 estack_top, estack_bottom);
45 71
46 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 72 if (sysctl_panic_on_stackoverflow)
47 current->comm, curbase, regs->sp); 73 panic("low stack detected by irq handler - check messages\n");
48#endif 74#endif
49} 75}
50 76
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h> 12#include <linux/device.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h> 14#include <linux/acpi.h>
15#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ea9d5f2f13ef..2889b3d43882 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
50 put_online_cpus(); 50 put_online_cpus();
51} 51}
52 52
53void arch_jump_label_transform_static(struct jump_entry *entry, 53__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
54 enum jump_label_type type) 54 enum jump_label_type type)
55{ 55{
56 __jump_label_transform(entry, type, text_poke_early); 56 __jump_label_transform(entry, type, text_poke_early);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41 41
42#define MMU_QUEUE_SIZE 1024
43
44static int kvmapf = 1; 42static int kvmapf = 1;
45 43
46static int parse_no_kvmapf(char *arg) 44static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
60 58
61early_param("no-steal-acc", parse_no_stealacc); 59early_param("no-steal-acc", parse_no_stealacc);
62 60
63struct kvm_para_state {
64 u8 mmu_queue[MMU_QUEUE_SIZE];
65 int mmu_queue_len;
66};
67
68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 61static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 62static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
71static int has_steal_clock = 0; 63static int has_steal_clock = 0;
72 64
73static struct kvm_para_state *kvm_para_state(void)
74{
75 return &per_cpu(para_state, raw_smp_processor_id());
76}
77
78/* 65/*
79 * No need for any "IO delay" on KVM 66 * No need for any "IO delay" on KVM
80 */ 67 */
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
271 } 258 }
272} 259}
273 260
274static void kvm_mmu_op(void *buffer, unsigned len)
275{
276 int r;
277 unsigned long a1, a2;
278
279 do {
280 a1 = __pa(buffer);
281 a2 = 0; /* on i386 __pa() always returns <4G */
282 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
283 buffer += r;
284 len -= r;
285 } while (len);
286}
287
288static void mmu_queue_flush(struct kvm_para_state *state)
289{
290 if (state->mmu_queue_len) {
291 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
292 state->mmu_queue_len = 0;
293 }
294}
295
296static void kvm_deferred_mmu_op(void *buffer, int len)
297{
298 struct kvm_para_state *state = kvm_para_state();
299
300 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
301 kvm_mmu_op(buffer, len);
302 return;
303 }
304 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
305 mmu_queue_flush(state);
306 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
307 state->mmu_queue_len += len;
308}
309
310static void kvm_mmu_write(void *dest, u64 val)
311{
312 __u64 pte_phys;
313 struct kvm_mmu_op_write_pte wpte;
314
315#ifdef CONFIG_HIGHPTE
316 struct page *page;
317 unsigned long dst = (unsigned long) dest;
318
319 page = kmap_atomic_to_page(dest);
320 pte_phys = page_to_pfn(page);
321 pte_phys <<= PAGE_SHIFT;
322 pte_phys += (dst & ~(PAGE_MASK));
323#else
324 pte_phys = (unsigned long)__pa(dest);
325#endif
326 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
327 wpte.pte_val = val;
328 wpte.pte_phys = pte_phys;
329
330 kvm_deferred_mmu_op(&wpte, sizeof wpte);
331}
332
333/*
334 * We only need to hook operations that are MMU writes. We hook these so that
335 * we can use lazy MMU mode to batch these operations. We could probably
336 * improve the performance of the host code if we used some of the information
337 * here to simplify processing of batched writes.
338 */
339static void kvm_set_pte(pte_t *ptep, pte_t pte)
340{
341 kvm_mmu_write(ptep, pte_val(pte));
342}
343
344static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
345 pte_t *ptep, pte_t pte)
346{
347 kvm_mmu_write(ptep, pte_val(pte));
348}
349
350static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
351{
352 kvm_mmu_write(pmdp, pmd_val(pmd));
353}
354
355#if PAGETABLE_LEVELS >= 3
356#ifdef CONFIG_X86_PAE
357static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
358{
359 kvm_mmu_write(ptep, pte_val(pte));
360}
361
362static void kvm_pte_clear(struct mm_struct *mm,
363 unsigned long addr, pte_t *ptep)
364{
365 kvm_mmu_write(ptep, 0);
366}
367
368static void kvm_pmd_clear(pmd_t *pmdp)
369{
370 kvm_mmu_write(pmdp, 0);
371}
372#endif
373
374static void kvm_set_pud(pud_t *pudp, pud_t pud)
375{
376 kvm_mmu_write(pudp, pud_val(pud));
377}
378
379#if PAGETABLE_LEVELS == 4
380static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
381{
382 kvm_mmu_write(pgdp, pgd_val(pgd));
383}
384#endif
385#endif /* PAGETABLE_LEVELS >= 3 */
386
387static void kvm_flush_tlb(void)
388{
389 struct kvm_mmu_op_flush_tlb ftlb = {
390 .header.op = KVM_MMU_OP_FLUSH_TLB,
391 };
392
393 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
394}
395
396static void kvm_release_pt(unsigned long pfn)
397{
398 struct kvm_mmu_op_release_pt rpt = {
399 .header.op = KVM_MMU_OP_RELEASE_PT,
400 .pt_phys = (u64)pfn << PAGE_SHIFT,
401 };
402
403 kvm_mmu_op(&rpt, sizeof rpt);
404}
405
406static void kvm_enter_lazy_mmu(void)
407{
408 paravirt_enter_lazy_mmu();
409}
410
411static void kvm_leave_lazy_mmu(void)
412{
413 struct kvm_para_state *state = kvm_para_state();
414
415 mmu_queue_flush(state);
416 paravirt_leave_lazy_mmu();
417}
418
419static void __init paravirt_ops_setup(void) 261static void __init paravirt_ops_setup(void)
420{ 262{
421 pv_info.name = "KVM"; 263 pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
424 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 266 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
425 pv_cpu_ops.io_delay = kvm_io_delay; 267 pv_cpu_ops.io_delay = kvm_io_delay;
426 268
427 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
428 pv_mmu_ops.set_pte = kvm_set_pte;
429 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
430 pv_mmu_ops.set_pmd = kvm_set_pmd;
431#if PAGETABLE_LEVELS >= 3
432#ifdef CONFIG_X86_PAE
433 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
434 pv_mmu_ops.pte_clear = kvm_pte_clear;
435 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
436#endif
437 pv_mmu_ops.set_pud = kvm_set_pud;
438#if PAGETABLE_LEVELS == 4
439 pv_mmu_ops.set_pgd = kvm_set_pgd;
440#endif
441#endif
442 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
443 pv_mmu_ops.release_pte = kvm_release_pt;
444 pv_mmu_ops.release_pmd = kvm_release_pt;
445 pv_mmu_ops.release_pud = kvm_release_pt;
446
447 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
448 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
449 }
450#ifdef CONFIG_X86_IO_APIC 269#ifdef CONFIG_X86_IO_APIC
451 no_timer_check = 1; 270 no_timer_check = 1;
452#endif 271#endif
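The kvm.c hunks delete the KVM_HC_MMU_OP paravirt path, which queued MMU writes in a per-CPU buffer while in lazy MMU mode and flushed them with a single hypercall. For reference, a stripped-down userspace illustration of that queue-and-flush batching pattern as the removed code used it; the buffer, names and fake flush here are all invented, not the KVM interface:

/*
 * Queue operations while "lazy", drain in one call when the queue fills
 * or when lazy mode is left.
 */
#include <stdio.h>
#include <string.h>

#define QUEUE_SIZE 64

static unsigned char queue[QUEUE_SIZE];
static int queue_len;
static int lazy_mode;

static void flush_queue(void)
{
	if (queue_len) {
		printf("flushing %d queued bytes in one call\n", queue_len);
		queue_len = 0;
	}
}

static void submit_op(const void *op, int len)
{
	if (!lazy_mode) {                /* not batching: issue immediately */
		printf("direct op, %d bytes\n", len);
		return;
	}
	if (queue_len + len > QUEUE_SIZE)
		flush_queue();           /* no room: drain first */
	memcpy(queue + queue_len, op, len);
	queue_len += len;
}

int main(void)
{
	char op[16] = { 0 };

	lazy_mode = 1;
	for (int i = 0; i < 10; i++)
		submit_op(op, sizeof(op));
	lazy_mode = 0;
	flush_queue();                   /* leaving lazy mode drains the rest */
	return 0;
}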
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index d494799aafcd..fe86493f3ed1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -1,14 +1,18 @@
1/* 1/*
2 * AMD CPU Microcode Update Driver for Linux 2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc. 3 * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
4 * 4 *
5 * Author: Peter Oruba <peter.oruba@amd.com> 5 * Author: Peter Oruba <peter.oruba@amd.com>
6 * 6 *
7 * Based on work by: 7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> 8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 * 9 *
10 * This driver allows to upgrade microcode on AMD 10 * Maintainers:
11 * family 0x10 and 0x11 processors. 11 * Andreas Herrmann <andreas.herrmann3@amd.com>
12 * Borislav Petkov <borislav.petkov@amd.com>
13 *
14 * This driver allows to upgrade microcode on F10h AMD
15 * CPUs and later.
12 * 16 *
13 * Licensed under the terms of the GNU General Public 17 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 18 * License version 2. See file COPYING for details.
@@ -71,6 +75,9 @@ struct microcode_amd {
71 75
72static struct equiv_cpu_entry *equiv_cpu_table; 76static struct equiv_cpu_entry *equiv_cpu_table;
73 77
78/* page-sized ucode patch buffer */
79void *patch;
80
74static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 81static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
75{ 82{
76 struct cpuinfo_x86 *c = &cpu_data(cpu); 83 struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
86 return 0; 93 return 0;
87} 94}
88 95
89static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, 96static unsigned int verify_ucode_size(int cpu, u32 patch_size,
90 int rev) 97 unsigned int size)
91{ 98{
92 unsigned int current_cpu_id; 99 struct cpuinfo_x86 *c = &cpu_data(cpu);
93 u16 equiv_cpu_id = 0; 100 u32 max_size;
94 unsigned int i = 0; 101
102#define F1XH_MPB_MAX_SIZE 2048
103#define F14H_MPB_MAX_SIZE 1824
104#define F15H_MPB_MAX_SIZE 4096
105
106 switch (c->x86) {
107 case 0x14:
108 max_size = F14H_MPB_MAX_SIZE;
109 break;
110 case 0x15:
111 max_size = F15H_MPB_MAX_SIZE;
112 break;
113 default:
114 max_size = F1XH_MPB_MAX_SIZE;
115 break;
116 }
117
118 if (patch_size > min_t(u32, size, max_size)) {
119 pr_err("patch size mismatch\n");
120 return 0;
121 }
122
123 return patch_size;
124}
125
126static u16 find_equiv_id(void)
127{
128 unsigned int current_cpu_id, i = 0;
95 129
96 BUG_ON(equiv_cpu_table == NULL); 130 BUG_ON(equiv_cpu_table == NULL);
131
97 current_cpu_id = cpuid_eax(0x00000001); 132 current_cpu_id = cpuid_eax(0x00000001);
98 133
99 while (equiv_cpu_table[i].installed_cpu != 0) { 134 while (equiv_cpu_table[i].installed_cpu != 0) {
100 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { 135 if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
101 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; 136 return equiv_cpu_table[i].equiv_cpu;
102 break; 137
103 }
104 i++; 138 i++;
105 } 139 }
140 return 0;
141}
106 142
143/*
144 * we signal a good patch is found by returning its size > 0
145 */
146static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
147 unsigned int leftover_size, int rev,
148 unsigned int *current_size)
149{
150 struct microcode_header_amd *mc_hdr;
151 unsigned int actual_size;
152 u16 equiv_cpu_id;
153
154 /* size of the current patch we're staring at */
155 *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
156
157 equiv_cpu_id = find_equiv_id();
107 if (!equiv_cpu_id) 158 if (!equiv_cpu_id)
108 return 0; 159 return 0;
109 160
161 /*
162 * let's look at the patch header itself now
163 */
164 mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
165
110 if (mc_hdr->processor_rev_id != equiv_cpu_id) 166 if (mc_hdr->processor_rev_id != equiv_cpu_id)
111 return 0; 167 return 0;
112 168
@@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
120 if (mc_hdr->patch_id <= rev) 176 if (mc_hdr->patch_id <= rev)
121 return 0; 177 return 0;
122 178
123 return 1; 179 /*
180 * now that the header looks sane, verify its size
181 */
182 actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
183 if (!actual_size)
184 return 0;
185
186 /* clear the patch buffer */
187 memset(patch, 0, PAGE_SIZE);
188
189 /* all looks ok, get the binary patch */
190 get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
191
192 return actual_size;
124} 193}
125 194
126static int apply_microcode_amd(int cpu) 195static int apply_microcode_amd(int cpu)
@@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 224 return 0;
156} 225}
157 226
158static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159{
160 struct cpuinfo_x86 *c = &cpu_data(cpu);
161 u32 max_size, actual_size;
162
163#define F1XH_MPB_MAX_SIZE 2048
164#define F14H_MPB_MAX_SIZE 1824
165#define F15H_MPB_MAX_SIZE 4096
166
167 switch (c->x86) {
168 case 0x14:
169 max_size = F14H_MPB_MAX_SIZE;
170 break;
171 case 0x15:
172 max_size = F15H_MPB_MAX_SIZE;
173 break;
174 default:
175 max_size = F1XH_MPB_MAX_SIZE;
176 break;
177 }
178
179 actual_size = *(u32 *)(buf + 4);
180
181 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
182 pr_err("section size mismatch\n");
183 return 0;
184 }
185
186 return actual_size;
187}
188
189static struct microcode_header_amd *
190get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
191{
192 struct microcode_header_amd *mc = NULL;
193 unsigned int actual_size = 0;
194
195 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
196 pr_err("invalid type field in container file section header\n");
197 goto out;
198 }
199
200 actual_size = verify_ucode_size(cpu, buf, size);
201 if (!actual_size)
202 goto out;
203
204 mc = vzalloc(actual_size);
205 if (!mc)
206 goto out;
207
208 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
209 *mc_size = actual_size + SECTION_HDR_SIZE;
210
211out:
212 return mc;
213}
214
215static int install_equiv_cpu_table(const u8 *buf) 227static int install_equiv_cpu_table(const u8 *buf)
216{ 228{
217 unsigned int *ibuf = (unsigned int *)buf; 229 unsigned int *ibuf = (unsigned int *)buf;
@@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
247{ 259{
248 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 260 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
249 struct microcode_header_amd *mc_hdr = NULL; 261 struct microcode_header_amd *mc_hdr = NULL;
250 unsigned int mc_size, leftover; 262 unsigned int mc_size, leftover, current_size = 0;
251 int offset; 263 int offset;
252 const u8 *ucode_ptr = data; 264 const u8 *ucode_ptr = data;
253 void *new_mc = NULL; 265 void *new_mc = NULL;
254 unsigned int new_rev = uci->cpu_sig.rev; 266 unsigned int new_rev = uci->cpu_sig.rev;
255 enum ucode_state state = UCODE_OK; 267 enum ucode_state state = UCODE_ERROR;
256 268
257 offset = install_equiv_cpu_table(ucode_ptr); 269 offset = install_equiv_cpu_table(ucode_ptr);
258 if (offset < 0) { 270 if (offset < 0) {
259 pr_err("failed to create equivalent cpu table\n"); 271 pr_err("failed to create equivalent cpu table\n");
260 return UCODE_ERROR; 272 goto out;
261 } 273 }
262
263 ucode_ptr += offset; 274 ucode_ptr += offset;
264 leftover = size - offset; 275 leftover = size - offset;
265 276
266 while (leftover) { 277 if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
267 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); 278 pr_err("invalid type field in container file section header\n");
268 if (!mc_hdr) 279 goto free_table;
269 break; 280 }
270 281
271 if (get_matching_microcode(cpu, mc_hdr, new_rev)) { 282 while (leftover) {
272 vfree(new_mc); 283 mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
284 new_rev, &current_size);
285 if (mc_size) {
286 mc_hdr = patch;
287 new_mc = patch;
273 new_rev = mc_hdr->patch_id; 288 new_rev = mc_hdr->patch_id;
274 new_mc = mc_hdr; 289 goto out_ok;
275 } else 290 }
276 vfree(mc_hdr);
277 291
278 ucode_ptr += mc_size; 292 ucode_ptr += current_size;
279 leftover -= mc_size; 293 leftover -= current_size;
280 } 294 }
281 295
282 if (!new_mc) { 296 if (!new_mc) {
@@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
284 goto free_table; 298 goto free_table;
285 } 299 }
286 300
287 if (!leftover) { 301out_ok:
288 vfree(uci->mc); 302 uci->mc = new_mc;
289 uci->mc = new_mc; 303 state = UCODE_OK;
290 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", 304 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
291 cpu, uci->cpu_sig.rev, new_rev); 305 cpu, uci->cpu_sig.rev, new_rev);
292 } else {
293 vfree(new_mc);
294 state = UCODE_ERROR;
295 }
296 306
297free_table: 307free_table:
298 free_equiv_cpu_table(); 308 free_equiv_cpu_table();
299 309
310out:
300 return state; 311 return state;
301} 312}
302 313
@@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu)
337{ 348{
338 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 349 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
339 350
340 vfree(uci->mc);
341 uci->mc = NULL; 351 uci->mc = NULL;
342} 352}
343 353
@@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = {
351 361
352struct microcode_ops * __init init_amd_microcode(void) 362struct microcode_ops * __init init_amd_microcode(void)
353{ 363{
364 patch = (void *)get_zeroed_page(GFP_KERNEL);
365 if (!patch)
366 return NULL;
367
354 return &microcode_amd_ops; 368 return &microcode_amd_ops;
355} 369}
370
371void __exit exit_amd_microcode(void)
372{
373 free_page((unsigned long)patch);
374}
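The reworked AMD loader keeps a single page-sized patch buffer and trusts a section's self-declared size only if it fits both the bytes left in the container and a per-family ceiling (verify_ucode_size()). A sketch of that bounds check; the family cases and size constants come from the hunk, the code around them is illustrative:

/*
 * A patch size is accepted only if it fits min(per-family max, bytes left).
 */
#include <stdio.h>

#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096

static unsigned int max_patch_size(unsigned int family)
{
	switch (family) {
	case 0x14: return F14H_MPB_MAX_SIZE;
	case 0x15: return F15H_MPB_MAX_SIZE;
	default:   return F1XH_MPB_MAX_SIZE;
	}
}

static unsigned int verify_size(unsigned int family, unsigned int patch_size,
				unsigned int leftover)
{
	unsigned int limit = max_patch_size(family);

	if (limit > leftover)
		limit = leftover;        /* min(per-family max, bytes left) */
	if (patch_size > limit) {
		fprintf(stderr, "patch size mismatch\n");
		return 0;                /* 0 means "reject this section" */
	}
	return patch_size;
}

int main(void)
{
	printf("%u\n", verify_size(0x15, 3000, 8192));  /* accepted: 3000 */
	printf("%u\n", verify_size(0x14, 3000, 8192));  /* rejected: 0 */
	return 0;
}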
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a664e797..fda91c307104 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -256,7 +256,7 @@ static int __init microcode_dev_init(void)
256 return 0; 256 return 0;
257} 257}
258 258
259static void microcode_dev_exit(void) 259static void __exit microcode_dev_exit(void)
260{ 260{
261 misc_deregister(&microcode_dev); 261 misc_deregister(&microcode_dev);
262} 262}
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
292 return err; 292 return err;
293} 293}
294 294
295static ssize_t reload_store(struct sys_device *dev, 295static ssize_t reload_store(struct device *dev,
296 struct sysdev_attribute *attr, 296 struct device_attribute *attr,
297 const char *buf, size_t size) 297 const char *buf, size_t size)
298{ 298{
299 unsigned long val; 299 unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
318 return ret; 318 return ret;
319} 319}
320 320
321static ssize_t version_show(struct sys_device *dev, 321static ssize_t version_show(struct device *dev,
322 struct sysdev_attribute *attr, char *buf) 322 struct device_attribute *attr, char *buf)
323{ 323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
325 325
326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); 326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
327} 327}
328 328
329static ssize_t pf_show(struct sys_device *dev, 329static ssize_t pf_show(struct device *dev,
330 struct sysdev_attribute *attr, char *buf) 330 struct device_attribute *attr, char *buf)
331{ 331{
332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
333 333
334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); 334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
335} 335}
336 336
337static SYSDEV_ATTR(reload, 0200, NULL, reload_store); 337static DEVICE_ATTR(reload, 0200, NULL, reload_store);
338static SYSDEV_ATTR(version, 0400, version_show, NULL); 338static DEVICE_ATTR(version, 0400, version_show, NULL);
339static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); 339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 340
341static struct attribute *mc_default_attrs[] = { 341static struct attribute *mc_default_attrs[] = {
342 &attr_reload.attr, 342 &dev_attr_reload.attr,
343 &attr_version.attr, 343 &dev_attr_version.attr,
344 &attr_processor_flags.attr, 344 &dev_attr_processor_flags.attr,
345 NULL 345 NULL
346}; 346};
347 347
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
405 return ustate; 405 return ustate;
406} 406}
407 407
408static int mc_sysdev_add(struct sys_device *sys_dev) 408static int mc_device_add(struct device *dev, struct subsys_interface *sif)
409{ 409{
410 int err, cpu = sys_dev->id; 410 int err, cpu = dev->id;
411 411
412 if (!cpu_online(cpu)) 412 if (!cpu_online(cpu))
413 return 0; 413 return 0;
414 414
415 pr_debug("CPU%d added\n", cpu); 415 pr_debug("CPU%d added\n", cpu);
416 416
417 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 417 err = sysfs_create_group(&dev->kobj, &mc_attr_group);
418 if (err) 418 if (err)
419 return err; 419 return err;
420 420
421 if (microcode_init_cpu(cpu) == UCODE_ERROR) { 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 422 sysfs_remove_group(&dev->kobj, &mc_attr_group);
423 return -EINVAL; 423 return -EINVAL;
424 } 424 }
425 425
426 return err; 426 return err;
427} 427}
428 428
429static int mc_sysdev_remove(struct sys_device *sys_dev) 429static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
430{ 430{
431 int cpu = sys_dev->id; 431 int cpu = dev->id;
432 432
433 if (!cpu_online(cpu)) 433 if (!cpu_online(cpu))
434 return 0; 434 return 0;
435 435
436 pr_debug("CPU%d removed\n", cpu); 436 pr_debug("CPU%d removed\n", cpu);
437 microcode_fini_cpu(cpu); 437 microcode_fini_cpu(cpu);
438 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 438 sysfs_remove_group(&dev->kobj, &mc_attr_group);
439 return 0; 439 return 0;
440} 440}
441 441
442static struct sysdev_driver mc_sysdev_driver = { 442static struct subsys_interface mc_cpu_interface = {
443 .add = mc_sysdev_add, 443 .name = "microcode",
444 .remove = mc_sysdev_remove, 444 .subsys = &cpu_subsys,
445 .add_dev = mc_device_add,
446 .remove_dev = mc_device_remove,
445}; 447};
446 448
447/** 449/**
@@ -464,9 +466,9 @@ static __cpuinit int
464mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 466mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
465{ 467{
466 unsigned int cpu = (unsigned long)hcpu; 468 unsigned int cpu = (unsigned long)hcpu;
467 struct sys_device *sys_dev; 469 struct device *dev;
468 470
469 sys_dev = get_cpu_sysdev(cpu); 471 dev = get_cpu_device(cpu);
470 switch (action) { 472 switch (action) {
471 case CPU_ONLINE: 473 case CPU_ONLINE:
472 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
474 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
475 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
476 pr_debug("CPU%d added\n", cpu); 478 pr_debug("CPU%d added\n", cpu);
477 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&dev->kobj, &mc_attr_group))
478 pr_err("Failed to create group for CPU%d\n", cpu); 480 pr_err("Failed to create group for CPU%d\n", cpu);
479 break; 481 break;
480 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
481 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
482 /* Suspend is in progress, only remove the interface */ 484 /* Suspend is in progress, only remove the interface */
483 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 485 sysfs_remove_group(&dev->kobj, &mc_attr_group);
484 pr_debug("CPU%d removed\n", cpu); 486 pr_debug("CPU%d removed\n", cpu);
485 break; 487 break;
486 488
@@ -519,27 +521,23 @@ static int __init microcode_init(void)
519 521
520 microcode_pdev = platform_device_register_simple("microcode", -1, 522 microcode_pdev = platform_device_register_simple("microcode", -1,
521 NULL, 0); 523 NULL, 0);
522 if (IS_ERR(microcode_pdev)) { 524 if (IS_ERR(microcode_pdev))
523 microcode_dev_exit();
524 return PTR_ERR(microcode_pdev); 525 return PTR_ERR(microcode_pdev);
525 }
526 526
527 get_online_cpus(); 527 get_online_cpus();
528 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
529 529
530 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 530 error = subsys_interface_register(&mc_cpu_interface);
531 531
532 mutex_unlock(&microcode_mutex); 532 mutex_unlock(&microcode_mutex);
533 put_online_cpus(); 533 put_online_cpus();
534 534
535 if (error) { 535 if (error)
536 platform_device_unregister(microcode_pdev); 536 goto out_pdev;
537 return error;
538 }
539 537
540 error = microcode_dev_init(); 538 error = microcode_dev_init();
541 if (error) 539 if (error)
542 return error; 540 goto out_driver;
543 541
544 register_syscore_ops(&mc_syscore_ops); 542 register_syscore_ops(&mc_syscore_ops);
545 register_hotcpu_notifier(&mc_cpu_notifier); 543 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -548,11 +546,27 @@ static int __init microcode_init(void)
548 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); 546 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
549 547
550 return 0; 548 return 0;
549
550out_driver:
551 get_online_cpus();
552 mutex_lock(&microcode_mutex);
553
554 subsys_interface_unregister(&mc_cpu_interface);
555
556 mutex_unlock(&microcode_mutex);
557 put_online_cpus();
558
559out_pdev:
560 platform_device_unregister(microcode_pdev);
561 return error;
562
551} 563}
552module_init(microcode_init); 564module_init(microcode_init);
553 565
554static void __exit microcode_exit(void) 566static void __exit microcode_exit(void)
555{ 567{
568 struct cpuinfo_x86 *c = &cpu_data(0);
569
556 microcode_dev_exit(); 570 microcode_dev_exit();
557 571
558 unregister_hotcpu_notifier(&mc_cpu_notifier); 572 unregister_hotcpu_notifier(&mc_cpu_notifier);
@@ -561,7 +575,7 @@ static void __exit microcode_exit(void)
561 get_online_cpus(); 575 get_online_cpus();
562 mutex_lock(&microcode_mutex); 576 mutex_lock(&microcode_mutex);
563 577
564 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 578 subsys_interface_unregister(&mc_cpu_interface);
565 579
566 mutex_unlock(&microcode_mutex); 580 mutex_unlock(&microcode_mutex);
567 put_online_cpus(); 581 put_online_cpus();
@@ -570,6 +584,9 @@ static void __exit microcode_exit(void)
570 584
571 microcode_ops = NULL; 585 microcode_ops = NULL;
572 586
587 if (c->x86_vendor == X86_VENDOR_AMD)
588 exit_amd_microcode();
589
573 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 590 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
574} 591}
575module_exit(microcode_exit); 592module_exit(microcode_exit);
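Besides the sysdev-to-struct-device conversion, microcode_init() above is reshaped into the usual goto-based unwind: each failure jumps to a label that releases everything acquired so far, in reverse order, instead of undoing resources inline at every error check. A generic sketch of that shape, with stand-in functions rather than the kernel APIs:

/*
 * Acquire in order, unwind in reverse via labels on failure.
 */
#include <stdio.h>

static int grab_a(void)  { puts("A acquired"); return 0; }
static int grab_b(void)  { puts("B acquired"); return 0; }
static int grab_c(void)  { puts("C failed");   return -1; }
static void drop_a(void) { puts("A released"); }
static void drop_b(void) { puts("B released"); }

static int init(void)
{
	int err;

	err = grab_a();
	if (err)
		goto out;
	err = grab_b();
	if (err)
		goto out_a;
	err = grab_c();
	if (err)
		goto out_b;              /* unwind B, then A, in reverse order */
	return 0;

out_b:
	drop_b();
out_a:
	drop_a();
out:
	return err;
}

int main(void)
{
	return init() ? 1 : 0;
}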
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a6b79c16ec78..ca470e4c92dc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
95 } 95 }
96#endif 96#endif
97 97
98 set_bit(m->busid, mp_bus_not_pci);
98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 99 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
99 set_bit(m->busid, mp_bus_not_pci);
100#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 100#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
102#endif 102#endif
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
236 .notifier_call = msr_class_cpu_callback, 236 .notifier_call = msr_class_cpu_callback,
237}; 237};
238 238
239static char *msr_devnode(struct device *dev, mode_t *mode) 239static char *msr_devnode(struct device *dev, umode_t *mode)
240{ 240{
241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
242} 242}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58ddd..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
405 unknown_nmi_error(reason, regs); 405 unknown_nmi_error(reason, regs);
406} 406}
407 407
408/*
409 * NMIs can hit breakpoints which will cause it to lose its
410 * NMI context with the CPU when the breakpoint does an iret.
411 */
412#ifdef CONFIG_X86_32
413/*
414 * For i386, NMIs use the same stack as the kernel, and we can
415 * add a workaround to the iret problem in C. Simply have 3 states
416 * the NMI can be in.
417 *
418 * 1) not running
419 * 2) executing
420 * 3) latched
421 *
422 * When no NMI is in progress, it is in the "not running" state.
423 * When an NMI comes in, it goes into the "executing" state.
424 * Normally, if another NMI is triggered, it does not interrupt
425 * the running NMI and the HW will simply latch it so that when
426 * the first NMI finishes, it will restart the second NMI.
427 * (Note, the latch is binary, thus multiple NMIs triggering,
428 * when one is running, are ignored. Only one NMI is restarted.)
429 *
430 * If an NMI hits a breakpoint that executes an iret, another
431 * NMI can preempt it. We do not want to allow this new NMI
432 * to run, but we want to execute it when the first one finishes.
433 * We set the state to "latched", and the first NMI will perform
 434 * a cmpxchg on the state, and if it doesn't successfully
435 * reset the state to "not running" it will restart the next
436 * NMI.
437 */
438enum nmi_states {
439 NMI_NOT_RUNNING,
440 NMI_EXECUTING,
441 NMI_LATCHED,
442};
443static DEFINE_PER_CPU(enum nmi_states, nmi_state);
444
445#define nmi_nesting_preprocess(regs) \
446 do { \
447 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
448 __get_cpu_var(nmi_state) = NMI_LATCHED; \
449 return; \
450 } \
451 nmi_restart: \
452 __get_cpu_var(nmi_state) = NMI_EXECUTING; \
453 } while (0)
454
455#define nmi_nesting_postprocess() \
456 do { \
457 if (cmpxchg(&__get_cpu_var(nmi_state), \
458 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
459 goto nmi_restart; \
460 } while (0)
461#else /* x86_64 */
462/*
463 * In x86_64 things are a bit more difficult. This has the same problem
464 * where an NMI hitting a breakpoint that calls iret will remove the
465 * NMI context, allowing a nested NMI to enter. What makes this more
466 * difficult is that both NMIs and breakpoints have their own stack.
467 * When a new NMI or breakpoint is executed, the stack is set to a fixed
468 * point. If an NMI is nested, it will have its stack set at that same
469 * fixed address that the first NMI had, and will start corrupting the
470 * stack. This is handled in entry_64.S, but the same problem exists with
471 * the breakpoint stack.
472 *
473 * If a breakpoint is being processed, and the debug stack is being used,
474 * if an NMI comes in and also hits a breakpoint, the stack pointer
475 * will be set to the same fixed address as the breakpoint that was
476 * interrupted, causing that stack to be corrupted. To handle this case,
477 * check if the stack that was interrupted is the debug stack, and if
478 * so, change the IDT so that new breakpoints will use the current stack
479 * and not switch to the fixed address. On return of the NMI, switch back
480 * to the original IDT.
481 */
482static DEFINE_PER_CPU(int, update_debug_stack);
483
484static inline void nmi_nesting_preprocess(struct pt_regs *regs)
485{
486 /*
487 * If we interrupted a breakpoint, it is possible that
488 * the nmi handler will have breakpoints too. We need to
489 * change the IDT such that breakpoints that happen here
490 * continue to use the NMI stack.
491 */
492 if (unlikely(is_debug_stack(regs->sp))) {
493 debug_stack_set_zero();
494 __get_cpu_var(update_debug_stack) = 1;
495 }
496}
497
498static inline void nmi_nesting_postprocess(void)
499{
500 if (unlikely(__get_cpu_var(update_debug_stack)))
501 debug_stack_reset();
502}
503#endif
504
408dotraplinkage notrace __kprobes void 505dotraplinkage notrace __kprobes void
409do_nmi(struct pt_regs *regs, long error_code) 506do_nmi(struct pt_regs *regs, long error_code)
410{ 507{
508 nmi_nesting_preprocess(regs);
509
411 nmi_enter(); 510 nmi_enter();
412 511
413 inc_irq_stat(__nmi_count); 512 inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
416 default_do_nmi(regs); 515 default_do_nmi(regs);
417 516
418 nmi_exit(); 517 nmi_exit();
518
519 /* On i386, may loop back to preprocess */
520 nmi_nesting_postprocess();
419} 521}
420 522
421void stop_nmi(void) 523void stop_nmi(void)
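The 32-bit side of the new nesting code is the three-state latch described in the comment above: an NMI arriving while another is executing is recorded as NMI_LATCHED, and the first handler only returns to NMI_NOT_RUNNING if a cmpxchg from NMI_EXECUTING succeeds, otherwise it replays. A single-threaded userspace model of those transitions using C11 atomics in place of the per-CPU variable; the nested NMI is simulated directly inside the handler:

/*
 * Three-state NMI latch: not running -> executing -> (maybe latched) ->
 * replay until the cmpxchg back to "not running" wins.
 */
#include <stdatomic.h>
#include <stdio.h>

enum nmi_state { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };

static _Atomic enum nmi_state state = NMI_NOT_RUNNING;
static int pending_nested;               /* test knob: inject one nested NMI */

static void handle_one_nmi(void)
{
	puts("handling NMI");
	if (pending_nested) {            /* model a nested NMI arriving here */
		pending_nested = 0;
		if (atomic_load(&state) != NMI_NOT_RUNNING)
			atomic_store(&state, NMI_LATCHED);
	}
}

static void do_nmi_model(void)
{
	if (atomic_load(&state) != NMI_NOT_RUNNING) {
		atomic_store(&state, NMI_LATCHED);   /* remember it for later */
		return;
	}
restart:
	atomic_store(&state, NMI_EXECUTING);
	handle_one_nmi();

	/* Only leave if nothing was latched while we ran. */
	enum nmi_state expected = NMI_EXECUTING;
	if (!atomic_compare_exchange_strong(&state, &expected, NMI_NOT_RUNNING))
		goto restart;
}

int main(void)
{
	pending_nested = 1;
	do_nmi_model();                  /* runs twice: once latched, once replayed */
	return 0;
}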
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..0d01a8ea4e11
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
1/*
2 * arch/x86/kernel/nmi-selftest.c
3 *
4 * Testsuite for NMI: IPIs
5 *
6 * Started by Don Zickus:
7 * (using lib/locking-selftest.c as a guide)
8 *
9 * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
10 */
11
12#include <linux/smp.h>
13#include <linux/cpumask.h>
14#include <linux/delay.h>
15
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#define SUCCESS 0
20#define FAILURE 1
21#define TIMEOUT 2
22
23static int nmi_fail;
24
25/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
27
28static int testcase_total;
29static int testcase_successes;
30static int expected_testcase_failures;
31static int unexpected_testcase_failures;
32static int unexpected_testcase_unknowns;
33
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{
36 unexpected_testcase_unknowns++;
37 return NMI_HANDLED;
38}
39
40static void init_nmi_testsuite(void)
41{
42 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44}
45
46static void cleanup_nmi_testsuite(void)
47{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49}
50
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{
53 int cpu = raw_smp_processor_id();
54
55 if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
56 return NMI_HANDLED;
57
58 return NMI_DONE;
59}
60
61static void test_nmi_ipi(struct cpumask *mask)
62{
63 unsigned long timeout;
64
65 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
66 NMI_FLAG_FIRST, "nmi_selftest")) {
67 nmi_fail = FAILURE;
68 return;
69 }
70
71 /* sync above data before sending NMI */
72 wmb();
73
74 apic->send_IPI_mask(mask, NMI_VECTOR);
75
76 /* Don't wait longer than a second */
77 timeout = USEC_PER_SEC;
78 while (!cpumask_empty(mask) && timeout--)
79 udelay(1);
80
81 /* What happens if we timeout, do we still unregister?? */
82 unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
83
84 if (!timeout)
85 nmi_fail = TIMEOUT;
86 return;
87}
88
89static void remote_ipi(void)
90{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
93 if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95}
96
97static void local_ipi(void)
98{
99 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102}
103
104static void reset_nmi(void)
105{
106 nmi_fail = 0;
107}
108
109static void dotest(void (*testcase_fn)(void), int expected)
110{
111 testcase_fn();
112 /*
113 * Filter out expected failures:
114 */
115 if (nmi_fail != expected) {
116 unexpected_testcase_failures++;
117
118 if (nmi_fail == FAILURE)
119 printk("FAILED |");
120 else if (nmi_fail == TIMEOUT)
121 printk("TIMEOUT|");
122 else
123 printk("ERROR |");
124 dump_stack();
125 } else {
126 testcase_successes++;
127 printk(" ok |");
128 }
129 testcase_total++;
130
131 reset_nmi();
132}
133
134static inline void print_testname(const char *testname)
135{
136 printk("%12s:", testname);
137}
138
139void nmi_selftest(void)
140{
141 init_nmi_testsuite();
142
143 /*
144 * Run the testsuite:
145 */
146 printk("----------------\n");
147 printk("| NMI testsuite:\n");
148 printk("--------------------\n");
149
150 print_testname("remote IPI");
151 dotest(remote_ipi, SUCCESS);
152 printk("\n");
153 print_testname("local IPI");
154 dotest(local_ipi, SUCCESS);
155 printk("\n");
156
157 cleanup_nmi_testsuite();
158
159 if (unexpected_testcase_failures) {
160 printk("--------------------\n");
161 printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
162 unexpected_testcase_failures, testcase_total);
163 printk("-----------------------------------------------------------------\n");
164 } else if (expected_testcase_failures && testcase_successes) {
165 printk("--------------------\n");
166 printk("%3d out of %3d testcases failed, as expected. |\n",
167 expected_testcase_failures, testcase_total);
168 printk("----------------------------------------------------\n");
169 } else if (expected_testcase_failures && !testcase_successes) {
170 printk("--------------------\n");
171 printk("All %3d testcases failed, as expected. |\n",
172 expected_testcase_failures);
173 printk("----------------------------------------\n");
174 } else {
175 printk("--------------------\n");
176 printk("Good, all %3d testcases passed! |\n",
177 testcase_successes);
178 printk("---------------------------------\n");
179 }
180}
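test_nmi_ipi() above sends the IPI and then polls for the target cpumask to empty, bounded by roughly a second of udelay(1) iterations. A threads-based model of that bounded poll; the timing constant follows the test, everything else (thread setup, the shared counter) is illustrative:

/*
 * Poll a shared "still pending" count with a hard iteration bound.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static atomic_int remaining = 2;         /* "CPUs" that still have to answer */

static void *fake_cpu(void *arg)
{
	(void)arg;
	usleep(1000);                    /* pretend the handler runs late */
	atomic_fetch_sub(&remaining, 1);
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	long timeout = 1000000;          /* ~1 s in 1 us steps, as in the test */

	for (int i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, fake_cpu, NULL);

	while (atomic_load(&remaining) && timeout--)
		usleep(1);

	puts(timeout > 0 ? "ok" : "TIMEOUT");

	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}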
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
49 58
50/* Dummy device used for NULL arguments (normally ISA). */ 59/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
169#endif 178#endif
170 if (!strncmp(p, "pt", 2)) 179 if (!strncmp(p, "pt", 2))
171 iommu_pass_through = 1; 180 iommu_pass_through = 1;
181 if (!strncmp(p, "group_mf", 8))
182 iommu_group_mf = 1;
172 183
173 gart_parse_options(p); 184 gart_parse_options(p);
174 185
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b9b3b1a51643..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
293 regs.orig_ax = -1; 293 regs.orig_ax = -1;
294 regs.ip = (unsigned long) kernel_thread_helper; 294 regs.ip = (unsigned long) kernel_thread_helper;
295 regs.cs = __KERNEL_CS | get_kernel_rpl(); 295 regs.cs = __KERNEL_CS | get_kernel_rpl();
296 regs.flags = X86_EFLAGS_IF | 0x2; 296 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
297 297
298 /* Ok, create the new process.. */ 298 /* Ok, create the new process.. */
299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -403,6 +403,14 @@ void default_idle(void)
403EXPORT_SYMBOL(default_idle); 403EXPORT_SYMBOL(default_idle);
404#endif 404#endif
405 405
406bool set_pm_idle_to_default(void)
407{
408 bool ret = !!pm_idle;
409
410 pm_idle = default_idle;
411
412 return ret;
413}
406void stop_this_cpu(void *dummy) 414void stop_this_cpu(void *dummy)
407{ 415{
408 local_irq_disable(); 416 local_irq_disable();
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..485204f58cda 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
99 99
100 /* endless idle loop with no priority at all */ 100 /* endless idle loop with no priority at all */
101 while (1) { 101 while (1) {
102 tick_nohz_stop_sched_tick(1); 102 tick_nohz_idle_enter();
103 rcu_idle_enter();
103 while (!need_resched()) { 104 while (!need_resched()) {
104 105
105 check_pgt_cache(); 106 check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
116 pm_idle(); 117 pm_idle();
117 start_critical_timings(); 118 start_critical_timings();
118 } 119 }
119 tick_nohz_restart_sched_tick(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit();
120 preempt_enable_no_resched(); 122 preempt_enable_no_resched();
121 schedule(); 123 schedule();
122 preempt_disable(); 124 preempt_disable();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..9b9fe4a85c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
122 122
123 /* endless idle loop with no priority at all */ 123 /* endless idle loop with no priority at all */
124 while (1) { 124 while (1) {
125 tick_nohz_stop_sched_tick(1); 125 tick_nohz_idle_enter();
126 while (!need_resched()) { 126 while (!need_resched()) {
127 127
128 rmb(); 128 rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
139 enter_idle(); 139 enter_idle();
140 /* Don't trace irqs off for idle */ 140 /* Don't trace irqs off for idle */
141 stop_critical_timings(); 141 stop_critical_timings();
142
143 /* enter_idle() needs rcu for notifiers */
144 rcu_idle_enter();
145
142 if (cpuidle_idle_call()) 146 if (cpuidle_idle_call())
143 pm_idle(); 147 pm_idle();
148
149 rcu_idle_exit();
144 start_critical_timings(); 150 start_critical_timings();
145 151
146 /* In many cases the interrupt that ended idle 152 /* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
149 __exit_idle(); 155 __exit_idle();
150 } 156 }
151 157
152 tick_nohz_restart_sched_tick(); 158 tick_nohz_idle_exit();
153 preempt_enable_no_resched(); 159 preempt_enable_no_resched();
154 schedule(); 160 schedule();
155 preempt_disable(); 161 preempt_disable();
@@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
293 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 299 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
294 300
295 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 301 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
296 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 302 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
303 IO_BITMAP_BYTES, GFP_KERNEL);
297 if (!p->thread.io_bitmap_ptr) { 304 if (!p->thread.io_bitmap_ptr) {
298 p->thread.io_bitmap_max = 0; 305 p->thread.io_bitmap_max = 0;
299 return -ENOMEM; 306 return -ENOMEM;
300 } 307 }
301 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
302 IO_BITMAP_BYTES);
303 set_tsk_thread_flag(p, TIF_IO_BITMAP); 308 set_tsk_thread_flag(p, TIF_IO_BITMAP);
304 } 309 }
305 310
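In copy_thread() the open-coded kmalloc() plus memcpy() of the I/O bitmap becomes a single kmemdup() call, which removes the separate copy step after the allocation check. A userspace equivalent of that helper, malloc-based rather than the kernel API:

/*
 * Allocate-and-copy in one helper, so callers have a single failure path.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);     /* allocate and copy at one call site */
	return p;
}

int main(void)
{
	const unsigned char bitmap[8] = { 0xff, 0x0f };
	unsigned char *copy = memdup(bitmap, sizeof(bitmap));

	if (!copy)
		return 1;                /* single failure path, no partial state */
	printf("%#x %#x\n", copy[0], copy[1]);
	free(copy);
	return 0;
}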
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 82528799c5de..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -749,7 +749,8 @@ put:
749/* 749/*
750 * Handle PTRACE_POKEUSR calls for the debug register area. 750 * Handle PTRACE_POKEUSR calls for the debug register area.
751 */ 751 */
752int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) 752static int ptrace_set_debugreg(struct task_struct *tsk, int n,
753 unsigned long val)
753{ 754{
754 struct thread_struct *thread = &(tsk->thread); 755 struct thread_struct *thread = &(tsk->thread);
755 int rc = 0; 756 int rc = 0;
@@ -1391,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
1391 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1392 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1392 trace_sys_enter(regs, regs->orig_ax); 1393 trace_sys_enter(regs, regs->orig_ax);
1393 1394
1394 if (unlikely(current->audit_context)) { 1395 if (IS_IA32)
1395 if (IS_IA32) 1396 audit_syscall_entry(AUDIT_ARCH_I386,
1396 audit_syscall_entry(AUDIT_ARCH_I386, 1397 regs->orig_ax,
1397 regs->orig_ax, 1398 regs->bx, regs->cx,
1398 regs->bx, regs->cx, 1399 regs->dx, regs->si);
1399 regs->dx, regs->si);
1400#ifdef CONFIG_X86_64 1400#ifdef CONFIG_X86_64
1401 else 1401 else
1402 audit_syscall_entry(AUDIT_ARCH_X86_64, 1402 audit_syscall_entry(AUDIT_ARCH_X86_64,
1403 regs->orig_ax, 1403 regs->orig_ax,
1404 regs->di, regs->si, 1404 regs->di, regs->si,
1405 regs->dx, regs->r10); 1405 regs->dx, regs->r10);
1406#endif 1406#endif
1407 }
1408 1407
1409 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1410} 1409}
@@ -1413,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
1413{ 1412{
1414 bool step; 1413 bool step;
1415 1414
1416 if (unlikely(current->audit_context)) 1415 audit_syscall_exit(regs);
1417 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1418 1416
1419 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1417 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1420 trace_sys_exit(regs, regs->ax); 1418 trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index b78643d0f9a5..03920a15a632 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
553 quirk_amd_nb_node); 553 quirk_amd_nb_node);
554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, 554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
555 quirk_amd_nb_node); 555 quirk_amd_nb_node);
556DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
557 quirk_amd_nb_node);
558DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
559 quirk_amd_nb_node);
560DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
561 quirk_amd_nb_node);
562DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
563 quirk_amd_nb_node);
564DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
565 quirk_amd_nb_node);
566DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
567 quirk_amd_nb_node);
568
556#endif 569#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e334be1182b9..37a458b521a6 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -124,7 +124,7 @@ __setup("reboot=", reboot_setup);
124 */ 124 */
125 125
126/* 126/*
127 * Some machines require the "reboot=b" commandline option, 127 * Some machines require the "reboot=b" or "reboot=k" commandline options,
128 * this quirk makes that automatic. 128 * this quirk makes that automatic.
129 */ 129 */
130static int __init set_bios_reboot(const struct dmi_system_id *d) 130static int __init set_bios_reboot(const struct dmi_system_id *d)
@@ -136,6 +136,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
136 return 0; 136 return 0;
137} 137}
138 138
139static int __init set_kbd_reboot(const struct dmi_system_id *d)
140{
141 if (reboot_type != BOOT_KBD) {
142 reboot_type = BOOT_KBD;
143 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
144 }
145 return 0;
146}
147
139static struct dmi_system_id __initdata reboot_dmi_table[] = { 148static struct dmi_system_id __initdata reboot_dmi_table[] = {
140 { /* Handle problems with rebooting on Dell E520's */ 149 { /* Handle problems with rebooting on Dell E520's */
141 .callback = set_bios_reboot, 150 .callback = set_bios_reboot,
@@ -295,7 +304,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
295 }, 304 },
296 }, 305 },
297 { /* Handle reboot issue on Acer Aspire one */ 306 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot, 307 .callback = set_kbd_reboot,
299 .ident = "Acer Aspire One A110", 308 .ident = "Acer Aspire One A110",
300 .matches = { 309 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"), 310 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
@@ -443,6 +452,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), 452 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 }, 453 },
445 }, 454 },
455 { /* Handle problems with rebooting on the OptiPlex 990. */
456 .callback = set_pci_reboot,
457 .ident = "Dell OptiPlex 990",
458 .matches = {
459 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
460 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
461 },
462 },
446 { } 463 { }
447}; 464};
448 465
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 348ce016a835..af6db6ec5b2a 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,6 +12,7 @@
12#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
13#include <asm/x86_init.h> 13#include <asm/x86_init.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/mrst.h>
15 16
16#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
17/* 18/*
@@ -242,6 +243,10 @@ static __init int add_rtc_cmos(void)
242 if (of_have_populated_dt()) 243 if (of_have_populated_dt())
243 return 0; 244 return 0;
244 245
246 /* Intel MID platforms don't have ioport rtc */
247 if (mrst_identify_cpu())
248 return -ENODEV;
249
245 platform_device_register(&rtc_device); 250 platform_device_register(&rtc_device);
246 dev_info(&rtc_device.dev, 251 dev_info(&rtc_device.dev,
247 "registered platform RTC device (no PNP device found)\n"); 252 "registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d05444ac2aea..d7d5099fe874 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,12 +749,7 @@ void __init setup_arch(char **cmdline_p)
749#endif 749#endif
750#ifdef CONFIG_EFI 750#ifdef CONFIG_EFI
751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
752#ifdef CONFIG_X86_32 752 EFI_LOADER_SIGNATURE, 4)) {
753 "EL32",
754#else
755 "EL64",
756#endif
757 4)) {
758 efi_enabled = 1; 753 efi_enabled = 1;
759 efi_memblock_x86_reserve_range(); 754 efi_memblock_x86_reserve_range();
760 } 755 }
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 struct pt_regs *regs) 683 struct pt_regs *regs)
684{ 684{
685 sigset_t blocked;
686 int ret; 685 int ret;
687 686
688 /* Are we from a system call? */ 687 /* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
733 */ 732 */
734 regs->flags &= ~X86_EFLAGS_TF; 733 regs->flags &= ~X86_EFLAGS_TF;
735 734
736 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 735 block_sigmask(ka, sig);
737 if (!(ka->sa.sa_flags & SA_NODEFER))
738 sigaddset(&blocked, sig);
739 set_current_blocked(&blocked);
740 736
741 tracehook_signal_handler(sig, info, ka, regs, 737 tracehook_signal_handler(sig, info, ka, regs,
742 test_thread_flag(TIF_SINGLESTEP)); 738 test_thread_flag(TIF_SINGLESTEP));
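handle_signal() above drops the open-coded sequence (OR the handler's sa_mask into the blocked set, add the signal itself unless SA_NODEFER, install the result) in favour of block_sigmask(). A small bitmask model of what the removed lines did, with sigset_t reduced to an unsigned long purely for illustration:

/*
 * Block the handler's mask plus, unless SA_NODEFER, the signal itself.
 */
#include <stdio.h>

#define SA_NODEFER 0x40000000u

struct fake_sigaction {
	unsigned long sa_mask;           /* signals to block while handling */
	unsigned int  sa_flags;
};

static unsigned long current_blocked;

static void block_sigmask_model(const struct fake_sigaction *ka, int sig)
{
	unsigned long blocked = current_blocked | ka->sa_mask;

	if (!(ka->sa_flags & SA_NODEFER))
		blocked |= 1ul << (sig - 1);     /* also block the signal itself */
	current_blocked = blocked;               /* stand-in for set_current_blocked() */
}

int main(void)
{
	struct fake_sigaction ka = { .sa_mask = 1ul << 9, .sa_flags = 0 };

	block_sigmask_model(&ka, 2);
	printf("blocked = %#lx\n", current_blocked); /* bits for signals 2 and 10 */
	return 0;
}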
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
29#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/nmi.h>
32/* 33/*
33 * Some notes on x86 processor bugs affecting SMP operation: 34 * Some notes on x86 processor bugs affecting SMP operation:
34 * 35 *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
148 free_cpumask_var(allbutself); 149 free_cpumask_var(allbutself);
149} 150}
150 151
152static atomic_t stopping_cpu = ATOMIC_INIT(-1);
153
154static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
155{
156 /* We are registered on stopping cpu too, avoid spurious NMI */
157 if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
158 return NMI_HANDLED;
159
160 stop_this_cpu(NULL);
161
162 return NMI_HANDLED;
163}
164
165static void native_nmi_stop_other_cpus(int wait)
166{
167 unsigned long flags;
168 unsigned long timeout;
169
170 if (reboot_force)
171 return;
172
173 /*
174 * Use an own vector here because smp_call_function
175 * does lots of things not suitable in a panic situation.
176 */
177 if (num_online_cpus() > 1) {
178 /* did someone beat us here? */
179 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
180 return;
181
182 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
183 NMI_FLAG_FIRST, "smp_stop"))
184 /* Note: we ignore failures here */
185 return;
186
187 /* sync above data before sending NMI */
188 wmb();
189
190 apic->send_IPI_allbutself(NMI_VECTOR);
191
192 /*
193 * Don't wait longer than a second if the caller
194 * didn't ask us to wait.
195 */
196 timeout = USEC_PER_SEC;
197 while (num_online_cpus() > 1 && (wait || timeout--))
198 udelay(1);
199 }
200
201 local_irq_save(flags);
202 disable_local_APIC();
203 local_irq_restore(flags);
204}
205
151/* 206/*
152 * this function calls the 'stop' function on all other CPUs in the system. 207 * this function calls the 'stop' function on all other CPUs in the system.
153 */ 208 */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
160 irq_exit(); 215 irq_exit();
161} 216}
162 217
163static void native_stop_other_cpus(int wait) 218static void native_irq_stop_other_cpus(int wait)
164{ 219{
165 unsigned long flags; 220 unsigned long flags;
166 unsigned long timeout; 221 unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
194 local_irq_restore(flags); 249 local_irq_restore(flags);
195} 250}
196 251
252static void native_smp_disable_nmi_ipi(void)
253{
254 smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
255}
256
197/* 257/*
198 * Reschedule call back. 258 * Reschedule call back.
199 */ 259 */
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
225 irq_exit(); 285 irq_exit();
226} 286}
227 287
288static int __init nonmi_ipi_setup(char *str)
289{
290 native_smp_disable_nmi_ipi();
291 return 1;
292}
293
294__setup("nonmi_ipi", nonmi_ipi_setup);
295
228struct smp_ops smp_ops = { 296struct smp_ops smp_ops = {
229 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 297 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
230 .smp_prepare_cpus = native_smp_prepare_cpus, 298 .smp_prepare_cpus = native_smp_prepare_cpus,
231 .smp_cpus_done = native_smp_cpus_done, 299 .smp_cpus_done = native_smp_cpus_done,
232 300
233 .stop_other_cpus = native_stop_other_cpus, 301 .stop_other_cpus = native_nmi_stop_other_cpus,
234 .smp_send_reschedule = native_smp_send_reschedule, 302 .smp_send_reschedule = native_smp_send_reschedule,
235 303
236 .cpu_up = native_cpu_up, 304 .cpu_up = native_cpu_up,
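Note: the smp.c hunks above make the NMI-based path the default stop_other_cpus implementation and keep the old IRQ-based one reachable via the "nonmi_ipi" kernel command-line parameter. The key trick is the atomic_cmpxchg() election: only the first CPU to move stopping_cpu from -1 to its own id becomes the stopper, and its NMI handler ignores itself. A standalone user-space sketch of the same election idea (not kernel code, C11 atomics used purely for illustration):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stopping_cpu = ATOMIC_VAR_INIT(-1);

/* Succeeds only for the first caller; later callers see the slot taken. */
static int try_become_stopper(int cpu)
{
        int expected = -1;
        return atomic_compare_exchange_strong(&stopping_cpu, &expected, cpu);
}

int main(void)
{
        printf("cpu 0 wins: %d\n", try_become_stopper(0));  /* prints 1 */
        printf("cpu 1 wins: %d\n", try_become_stopper(1));  /* prints 0: beaten */
        return 0;
}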
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..66d250c00d11 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
207 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
208 */ 208 */
209 setup_vector_irq(smp_processor_id()); 209 setup_vector_irq(smp_processor_id());
210
211 /*
212 * Save our processor parameters. Note: this information
213 * is needed for clock calibration.
214 */
215 smp_store_cpu_info(cpuid);
216
210 /* 217 /*
211 * Get our bogomips. 218 * Get our bogomips.
219 * Update loops_per_jiffy in cpu_data. Previous call to
220 * smp_store_cpu_info() stored a value that is close but not as
221 * accurate as the value just calculated.
212 * 222 *
213 * Need to enable IRQs because it can take longer and then 223 * Need to enable IRQs because it can take longer and then
214 * the NMI watchdog might kill us. 224 * the NMI watchdog might kill us.
215 */ 225 */
216 local_irq_enable(); 226 local_irq_enable();
217 calibrate_delay(); 227 calibrate_delay();
228 cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
218 local_irq_disable(); 229 local_irq_disable();
219 pr_debug("Stack at about %p\n", &cpuid); 230 pr_debug("Stack at about %p\n", &cpuid);
220 231
221 /* 232 /*
222 * Save our processor parameters
223 */
224 smp_store_cpu_info(cpuid);
225
226 /*
227 * This must be done before setting cpu_online_mask 233 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting. 234 * or calling notify_cpu_starting.
229 */ 235 */
@@ -840,7 +846,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
840 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); 846 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
841 847
842 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 848 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
843 !physid_isset(apicid, phys_cpu_present_map)) { 849 !physid_isset(apicid, phys_cpu_present_map) ||
850 (!x2apic_mode && apicid >= 255)) {
844 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 851 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
845 return -EINVAL; 852 return -EINVAL;
846 } 853 }
@@ -1142,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1142{ 1149{
1143 pr_debug("Boot done.\n"); 1150 pr_debug("Boot done.\n");
1144 1151
1152 nmi_selftest();
1145 impress_friends(); 1153 impress_friends();
1146#ifdef CONFIG_X86_IO_APIC 1154#ifdef CONFIG_X86_IO_APIC
1147 setup_ioapic_dest(); 1155 setup_ioapic_dest();
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 000000000000..147fcd4941c4
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
1/* System call table for i386. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
13
14typedef asmlinkage void (*sys_call_ptr_t)(void);
15
16extern asmlinkage void sys_ni_syscall(void);
17
18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h>
25};
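Note: the new syscall_32.c relies on expanding the generated <asm/syscalls_32.h> twice with different __SYSCALL_I386 definitions, once for declarations and once for table initializers. A self-contained sketch of that pattern (the table is inlined here as a MY_SYSCALLS macro instead of a generated header, and the entry names are made up for the example):

#include <stdio.h>

#define MY_SYSCALLS(X)      \
        X(0, sys_restart)   \
        X(1, sys_exit)      \
        X(2, sys_fork)

/* First expansion: emit declarations. */
#define DECLARE(nr, sym) void sym(void);
MY_SYSCALLS(DECLARE)
#undef DECLARE

typedef void (*sys_call_ptr_t)(void);

/* Second expansion: emit designated initializers for the table. */
#define ENTRY(nr, sym) [nr] = sym,
static const sys_call_ptr_t table[] = { MY_SYSCALLS(ENTRY) };
#undef ENTRY

void sys_restart(void) { puts("restart"); }
void sys_exit(void)    { puts("exit"); }
void sys_fork(void)    { puts("fork"); }

int main(void) { table[1](); return 0; }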
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..7ac7943be02c 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __NO_STUBS 8#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_64.h>
10#undef __SYSCALL_64
9 11
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 12#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_UNISTD_64_H
17 13
18typedef void (*sys_call_ptr_t)(void); 14typedef void (*sys_call_ptr_t)(void);
19 15
@@ -21,9 +17,9 @@ extern void sys_ni_syscall(void);
21 17
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
23 /* 19 /*
24 *Smells like a like a compiler bug -- it doesn't work 20 * Smells like a compiler bug -- it doesn't work
25 *when the & below is removed. 21 * when the & below is removed.
26 */ 22 */
27 [0 ... __NR_syscall_max] = &sys_ni_syscall, 23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
28#include <asm/unistd_64.h> 24#include <asm/syscalls_64.h>
29}; 25};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e31293920..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long ptregs_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long ptregs_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long sys_old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long sys_old_readdir
92 .long sys_old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long ptregs_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long ptregs_sigreturn
122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_ni_syscall /* Old nfsservctl */
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long ptregs_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap_pgoff
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd_create
325 .long sys_eventfd
326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
335 .long sys_preadv
336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open
339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_process_vm_readv
350 .long sys_process_vm_writev
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..482ec3af2067 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,19 +306,20 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
306 == NOTIFY_STOP) 306 == NOTIFY_STOP)
307 return; 307 return;
308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
309#ifdef CONFIG_KPROBES 309
310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
311 == NOTIFY_STOP) 311 == NOTIFY_STOP)
312 return; 312 return;
313#else
314 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
315 == NOTIFY_STOP)
316 return;
317#endif
318 313
314 /*
315 * Let others (NMI) know that the debug stack is in use
316 * as we may switch to the interrupt stack.
317 */
318 debug_stack_usage_inc();
319 preempt_conditional_sti(regs); 319 preempt_conditional_sti(regs);
320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
321 preempt_conditional_cli(regs); 321 preempt_conditional_cli(regs);
322 debug_stack_usage_dec();
322} 323}
323 324
324#ifdef CONFIG_X86_64 325#ifdef CONFIG_X86_64
@@ -411,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
411 SIGTRAP) == NOTIFY_STOP) 412 SIGTRAP) == NOTIFY_STOP)
412 return; 413 return;
413 414
415 /*
416 * Let others (NMI) know that the debug stack is in use
417 * as we may switch to the interrupt stack.
418 */
419 debug_stack_usage_inc();
420
414 /* It's safe to allow irq's after DR6 has been saved */ 421 /* It's safe to allow irq's after DR6 has been saved */
415 preempt_conditional_sti(regs); 422 preempt_conditional_sti(regs);
416 423
@@ -418,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
418 handle_vm86_trap((struct kernel_vm86_regs *) regs, 425 handle_vm86_trap((struct kernel_vm86_regs *) regs,
419 error_code, 1); 426 error_code, 1);
420 preempt_conditional_cli(regs); 427 preempt_conditional_cli(regs);
428 debug_stack_usage_dec();
421 return; 429 return;
422 } 430 }
423 431
@@ -437,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
437 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 445 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
438 send_sigtrap(tsk, regs, error_code, si_code); 446 send_sigtrap(tsk, regs, error_code, si_code);
439 preempt_conditional_cli(regs); 447 preempt_conditional_cli(regs);
448 debug_stack_usage_dec();
440 449
441 return; 450 return;
442} 451}
@@ -723,4 +732,10 @@ void __init trap_init(void)
723 cpu_init(); 732 cpu_init();
724 733
725 x86_init.irqs.trap_init(); 734 x86_init.irqs.trap_init();
735
736#ifdef CONFIG_X86_64
737 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
738 set_nmi_gate(1, &debug);
739 set_nmi_gate(3, &int3);
740#endif
726} 741}
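Note: the traps.c hunks only call debug_stack_usage_inc()/debug_stack_usage_dec(); their implementation is not part of this diff. A plausible minimal sketch, assuming a per-CPU counter that the NMI path consults before reusing the debug stack (the variable name and per-CPU form are assumptions):

DECLARE_PER_CPU(u32, debug_stack_usage);

static inline void debug_stack_usage_inc(void)
{
	__get_cpu_var(debug_stack_usage)++;
}

static inline void debug_stack_usage_dec(void)
{
	__get_cpu_var(debug_stack_usage)--;
}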
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..a62c201c97ec 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
35 erroneous rdtsc usage on !cpu_has_tsc processors */ 35 erroneous rdtsc usage on !cpu_has_tsc processors */
36static int __read_mostly tsc_disabled = -1; 36static int __read_mostly tsc_disabled = -1;
37 37
38static int tsc_clocksource_reliable; 38int tsc_clocksource_reliable;
39/* 39/*
40 * Scheduler clock - returns current time in nanosec units. 40 * Scheduler clock - returns current time in nanosec units.
41 */ 41 */
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
178} 178}
179 179
180#define CAL_MS 10 180#define CAL_MS 10
181#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) 181#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
182#define CAL_PIT_LOOPS 1000 182#define CAL_PIT_LOOPS 1000
183 183
184#define CAL2_MS 50 184#define CAL2_MS 50
185#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) 185#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
186#define CAL2_PIT_LOOPS 5000 186#define CAL2_PIT_LOOPS 5000
187 187
188 188
@@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val)
290static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 290static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
291{ 291{
292 int count; 292 int count;
293 u64 tsc = 0; 293 u64 tsc = 0, prev_tsc = 0;
294 294
295 for (count = 0; count < 50000; count++) { 295 for (count = 0; count < 50000; count++) {
296 if (!pit_verify_msb(val)) 296 if (!pit_verify_msb(val))
297 break; 297 break;
298 prev_tsc = tsc;
298 tsc = get_cycles(); 299 tsc = get_cycles();
299 } 300 }
300 *deltap = get_cycles() - tsc; 301 *deltap = get_cycles() - prev_tsc;
301 *tscp = tsc; 302 *tscp = tsc;
302 303
303 /* 304 /*
@@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
311 * How many MSB values do we want to see? We aim for 312 * How many MSB values do we want to see? We aim for
312 * a maximum error rate of 500ppm (in practice the 313 * a maximum error rate of 500ppm (in practice the
313 * real error is much smaller), but refuse to spend 314 * real error is much smaller), but refuse to spend
314 * more than 25ms on it. 315 * more than 50ms on it.
315 */ 316 */
316#define MAX_QUICK_PIT_MS 25 317#define MAX_QUICK_PIT_MS 50
317#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 318#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
318 319
319static unsigned long quick_pit_calibrate(void) 320static unsigned long quick_pit_calibrate(void)
@@ -383,15 +384,12 @@ success:
383 * 384 *
384 * As a result, we can depend on there not being 385 * As a result, we can depend on there not being
385 * any odd delays anywhere, and the TSC reads are 386 * any odd delays anywhere, and the TSC reads are
386 * reliable (within the error). We also adjust the 387 * reliable (within the error).
387 * delta to the middle of the error bars, just
388 * because it looks nicer.
389 * 388 *
390 * kHz = ticks / time-in-seconds / 1000; 389 * kHz = ticks / time-in-seconds / 1000;
391 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 390 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
392 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) 391 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
393 */ 392 */
394 delta += (long)(d2 - d1)/2;
395 delta *= PIT_TICK_RATE; 393 delta *= PIT_TICK_RATE;
396 do_div(delta, i*256*1000); 394 do_div(delta, i*256*1000);
397 printk("Fast TSC calibration using PIT\n"); 395 printk("Fast TSC calibration using PIT\n");
@@ -995,3 +993,23 @@ void __init tsc_init(void)
995 check_system_tsc_reliable(); 993 check_system_tsc_reliable();
996} 994}
997 995
996#ifdef CONFIG_SMP
997/*
998 * If we have a constant TSC and are using the TSC for the delay loop,
999 * we can skip clock calibration if another cpu in the same socket has already
1000 * been calibrated. This assumes that CONSTANT_TSC applies to all
1001 * cpus in the socket - this should be a safe assumption.
1002 */
1003unsigned long __cpuinit calibrate_delay_is_known(void)
1004{
1005 int i, cpu = smp_processor_id();
1006
1007 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
1008 return 0;
1009
1010 for_each_online_cpu(i)
1011 if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
1012 return cpu_data(i).loops_per_jiffy;
1013 return 0;
1014}
1015#endif
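Note: calibrate_delay_is_known() added above returns a cached loops_per_jiffy from a sibling CPU on the same socket when the TSC is constant. It only helps if the generic calibration path asks for it first; a sketch of how that call site is presumably structured (the weak default and the fallback name do_timer_calibration are assumptions, not shown in this diff):

unsigned long __attribute__((weak)) calibrate_delay_is_known(void)
{
	return 0;	/* default: nothing cached, do a full calibration */
}

void calibrate_delay(void)
{
	unsigned long lpj = calibrate_delay_is_known();

	if (!lpj)
		lpj = do_timer_calibration();	/* hypothetical measuring path */
	loops_per_jiffy = lpj;
}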
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..9eba29b46cb7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 if (unsynchronized_tsc()) 113 if (unsynchronized_tsc())
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (tsc_clocksource_reliable) {
117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info( 118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n"); 119 "Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
172{ 172{
173 int cpus = 2; 173 int cpus = 2;
174 174
175 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) 175 if (unsynchronized_tsc() || tsc_clocksource_reliable)
176 return; 176 return;
177 177
178 /* 178 /*
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..b466cab5ba15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
335 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
337 337
338 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call __audit_syscall_exit since we do not exit via the normal paths */
339#ifdef CONFIG_AUDITSYSCALL
339 if (unlikely(current->audit_context)) 340 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0); 341 __audit_syscall_exit(1, 0);
342#endif
341 343
342 __asm__ __volatile__( 344 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 345 "movl %0,%%esp\n\t"
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
58}; 58};
59 59
60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; 60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
61 61
62static int __init vsyscall_setup(char *str) 62static int __init vsyscall_setup(char *str)
63{ 63{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
140 return nr; 140 return nr;
141} 141}
142 142
143static bool write_ok_or_segv(unsigned long ptr, size_t size)
144{
145 /*
146 * XXX: if access_ok, get_user, and put_user handled
147 * sig_on_uaccess_error, this could go away.
148 */
149
150 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
151 siginfo_t info;
152 struct thread_struct *thread = &current->thread;
153
154 thread->error_code = 6; /* user fault, no page, write */
155 thread->cr2 = ptr;
156 thread->trap_no = 14;
157
158 memset(&info, 0, sizeof(info));
159 info.si_signo = SIGSEGV;
160 info.si_errno = 0;
161 info.si_code = SEGV_MAPERR;
162 info.si_addr = (void __user *)ptr;
163
164 force_sig_info(SIGSEGV, &info, current);
165 return false;
166 } else {
167 return true;
168 }
169}
170
143bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) 171bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
144{ 172{
145 struct task_struct *tsk; 173 struct task_struct *tsk;
146 unsigned long caller; 174 unsigned long caller;
147 int vsyscall_nr; 175 int vsyscall_nr;
176 int prev_sig_on_uaccess_error;
148 long ret; 177 long ret;
149 178
150 /* 179 /*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
180 if (seccomp_mode(&tsk->seccomp)) 209 if (seccomp_mode(&tsk->seccomp))
181 do_exit(SIGKILL); 210 do_exit(SIGKILL);
182 211
212 /*
213 * With a real vsyscall, page faults cause SIGSEGV. We want to
214 * preserve that behavior to make writing exploits harder.
215 */
216 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
217 current_thread_info()->sig_on_uaccess_error = 1;
218
219 /*
220 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
221 * 64-bit, so we don't need to special-case it here. For all the
222 * vsyscalls, 0 means "don't write anything" not "write it at
223 * address 0".
224 */
225 ret = -EFAULT;
183 switch (vsyscall_nr) { 226 switch (vsyscall_nr) {
184 case 0: 227 case 0:
228 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
229 !write_ok_or_segv(regs->si, sizeof(struct timezone)))
230 break;
231
185 ret = sys_gettimeofday( 232 ret = sys_gettimeofday(
186 (struct timeval __user *)regs->di, 233 (struct timeval __user *)regs->di,
187 (struct timezone __user *)regs->si); 234 (struct timezone __user *)regs->si);
188 break; 235 break;
189 236
190 case 1: 237 case 1:
238 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
239 break;
240
191 ret = sys_time((time_t __user *)regs->di); 241 ret = sys_time((time_t __user *)regs->di);
192 break; 242 break;
193 243
194 case 2: 244 case 2:
245 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
246 !write_ok_or_segv(regs->si, sizeof(unsigned)))
247 break;
248
195 ret = sys_getcpu((unsigned __user *)regs->di, 249 ret = sys_getcpu((unsigned __user *)regs->di,
196 (unsigned __user *)regs->si, 250 (unsigned __user *)regs->si,
197 0); 251 0);
198 break; 252 break;
199 } 253 }
200 254
255 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
256
201 if (ret == -EFAULT) { 257 if (ret == -EFAULT) {
202 /* 258 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
203 * Bad news -- userspace fed a bad pointer to a vsyscall.
204 *
205 * With a real vsyscall, that would have caused SIGSEGV.
206 * To make writing reliable exploits using the emulated
207 * vsyscalls harder, generate SIGSEGV here as well.
208 */
209 warn_bad_vsyscall(KERN_INFO, regs, 259 warn_bad_vsyscall(KERN_INFO, regs,
210 "vsyscall fault (exploit attempt?)"); 260 "vsyscall fault (exploit attempt?)");
211 goto sigsegv; 261
262 /*
263 * If we failed to generate a signal for any reason,
264 * generate one here. (This should be impossible.)
265 */
266 if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
267 !sigismember(&tsk->pending.signal, SIGSEGV)))
268 goto sigsegv;
269
270 return true; /* Don't emulate the ret. */
212 } 271 }
213 272
214 regs->ax = ret; 273 regs->ax = ret;
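Note: the EMULATE default and write_ok_or_segv() only affect processes that still jump to the fixed-address vsyscall page instead of using the vDSO. A user-space sketch of such a legacy call, which the kernel now traps and emulates (0xffffffffff600000 is the traditional x86-64 vsyscall gettimeofday entry; this is for illustration only, new code should use the vDSO or the syscall):

#include <stdio.h>
#include <sys/time.h>

typedef int (*vgtod_t)(struct timeval *, struct timezone *);

int main(void)
{
	/* Legacy fixed-map vsyscall entry for gettimeofday on x86-64. */
	vgtod_t vgtod = (vgtod_t)0xffffffffff600000UL;
	struct timeval tv;

	if (vgtod(&tv, NULL) == 0)	/* emulated by emulate_vsyscall() above */
		printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}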
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd549397..947a06ccc673 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
92 92
93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
94 .setup_percpu_clockev = setup_secondary_APIC_clock, 94 .setup_percpu_clockev = setup_secondary_APIC_clock,
95 .fixup_cpu_id = x86_default_fixup_cpu_id,
95}; 96};
96 97
97static void default_nmi_init(void) { }; 98static void default_nmi_init(void) { };
@@ -114,4 +115,5 @@ struct x86_msi_ops x86_msi = {
114 .setup_msi_irqs = native_setup_msi_irqs, 115 .setup_msi_irqs = native_setup_msi_irqs,
115 .teardown_msi_irq = native_teardown_msi_irq, 116 .teardown_msi_irq = native_teardown_msi_irq,
116 .teardown_msi_irqs = default_teardown_msi_irqs, 117 .teardown_msi_irqs = default_teardown_msi_irqs,
118 .restore_msi_irqs = default_restore_msi_irqs,
117}; 119};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d8e990..1a7fe868f375 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
35 select KVM_MMIO 35 select KVM_MMIO
36 select TASKSTATS 36 select TASKSTATS
37 select TASK_DELAY_ACCT 37 select TASK_DELAY_ACCT
38 select PERF_EVENTS
38 ---help--- 39 ---help---
39 Support hosting fully virtualized guest machines using hardware 40 Support hosting fully virtualized guest machines using hardware
40 virtualization extensions. You will need a fairly recent 41 virtualization extensions. You will need a fairly recent
@@ -52,6 +53,8 @@ config KVM
52config KVM_INTEL 53config KVM_INTEL
53 tristate "KVM for Intel processors support" 54 tristate "KVM for Intel processors support"
54 depends on KVM 55 depends on KVM
56 # for perf_guest_get_msrs():
57 depends on CPU_SUP_INTEL
55 ---help--- 58 ---help---
56 Provides support for KVM on Intel processors equipped with the VT 59 Provides support for KVM on Intel processors equipped with the VT
57 extensions. 60 extensions.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f431c8..4f579e8dcacf 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 13
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
15 i8254.o timer.o 15 i8254.o timer.o cpuid.o pmu.o
16kvm-intel-y += vmx.o 16kvm-intel-y += vmx.o
17kvm-amd-y += svm.o 17kvm-amd-y += svm.o
18 18
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..89b02bfaaca5
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,670 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 * cpuid support routines
4 *
5 * derived from arch/x86/kvm/x86.c
6 *
7 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
8 * Copyright IBM Corporation, 2008
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 */
14
15#include <linux/kvm_host.h>
16#include <linux/module.h>
17#include <linux/vmalloc.h>
18#include <linux/uaccess.h>
19#include <asm/user.h>
20#include <asm/xsave.h>
21#include "cpuid.h"
22#include "lapic.h"
23#include "mmu.h"
24#include "trace.h"
25
26void kvm_update_cpuid(struct kvm_vcpu *vcpu)
27{
28 struct kvm_cpuid_entry2 *best;
29 struct kvm_lapic *apic = vcpu->arch.apic;
30
31 best = kvm_find_cpuid_entry(vcpu, 1, 0);
32 if (!best)
33 return;
34
35 /* Update OSXSAVE bit */
36 if (cpu_has_xsave && best->function == 0x1) {
37 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
38 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
39 best->ecx |= bit(X86_FEATURE_OSXSAVE);
40 }
41
42 if (apic) {
43 if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
44 apic->lapic_timer.timer_mode_mask = 3 << 17;
45 else
46 apic->lapic_timer.timer_mode_mask = 1 << 17;
47 }
48
49 kvm_pmu_cpuid_update(vcpu);
50}
51
52static int is_efer_nx(void)
53{
54 unsigned long long efer = 0;
55
56 rdmsrl_safe(MSR_EFER, &efer);
57 return efer & EFER_NX;
58}
59
60static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
61{
62 int i;
63 struct kvm_cpuid_entry2 *e, *entry;
64
65 entry = NULL;
66 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
67 e = &vcpu->arch.cpuid_entries[i];
68 if (e->function == 0x80000001) {
69 entry = e;
70 break;
71 }
72 }
73 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
74 entry->edx &= ~(1 << 20);
75 printk(KERN_INFO "kvm: guest NX capability removed\n");
76 }
77}
78
79/* when an old userspace process fills a new kernel module */
80int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
81 struct kvm_cpuid *cpuid,
82 struct kvm_cpuid_entry __user *entries)
83{
84 int r, i;
85 struct kvm_cpuid_entry *cpuid_entries;
86
87 r = -E2BIG;
88 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
89 goto out;
90 r = -ENOMEM;
91 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
92 if (!cpuid_entries)
93 goto out;
94 r = -EFAULT;
95 if (copy_from_user(cpuid_entries, entries,
96 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
97 goto out_free;
98 for (i = 0; i < cpuid->nent; i++) {
99 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
100 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
101 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
102 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
103 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
104 vcpu->arch.cpuid_entries[i].index = 0;
105 vcpu->arch.cpuid_entries[i].flags = 0;
106 vcpu->arch.cpuid_entries[i].padding[0] = 0;
107 vcpu->arch.cpuid_entries[i].padding[1] = 0;
108 vcpu->arch.cpuid_entries[i].padding[2] = 0;
109 }
110 vcpu->arch.cpuid_nent = cpuid->nent;
111 cpuid_fix_nx_cap(vcpu);
112 r = 0;
113 kvm_apic_set_version(vcpu);
114 kvm_x86_ops->cpuid_update(vcpu);
115 kvm_update_cpuid(vcpu);
116
117out_free:
118 vfree(cpuid_entries);
119out:
120 return r;
121}
122
123int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
124 struct kvm_cpuid2 *cpuid,
125 struct kvm_cpuid_entry2 __user *entries)
126{
127 int r;
128
129 r = -E2BIG;
130 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
131 goto out;
132 r = -EFAULT;
133 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
134 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
135 goto out;
136 vcpu->arch.cpuid_nent = cpuid->nent;
137 kvm_apic_set_version(vcpu);
138 kvm_x86_ops->cpuid_update(vcpu);
139 kvm_update_cpuid(vcpu);
140 return 0;
141
142out:
143 return r;
144}
145
146int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
147 struct kvm_cpuid2 *cpuid,
148 struct kvm_cpuid_entry2 __user *entries)
149{
150 int r;
151
152 r = -E2BIG;
153 if (cpuid->nent < vcpu->arch.cpuid_nent)
154 goto out;
155 r = -EFAULT;
156 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
157 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
158 goto out;
159 return 0;
160
161out:
162 cpuid->nent = vcpu->arch.cpuid_nent;
163 return r;
164}
165
166static void cpuid_mask(u32 *word, int wordnum)
167{
168 *word &= boot_cpu_data.x86_capability[wordnum];
169}
170
171static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
172 u32 index)
173{
174 entry->function = function;
175 entry->index = index;
176 cpuid_count(entry->function, entry->index,
177 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
178 entry->flags = 0;
179}
180
181static bool supported_xcr0_bit(unsigned bit)
182{
183 u64 mask = ((u64)1 << bit);
184
185 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
186}
187
188#define F(x) bit(X86_FEATURE_##x)
189
190static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
191 u32 index, int *nent, int maxnent)
192{
193 int r;
194 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
195#ifdef CONFIG_X86_64
196 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
197 ? F(GBPAGES) : 0;
198 unsigned f_lm = F(LM);
199#else
200 unsigned f_gbpages = 0;
201 unsigned f_lm = 0;
202#endif
203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
204
205 /* cpuid 1.edx */
206 const u32 kvm_supported_word0_x86_features =
207 F(FPU) | F(VME) | F(DE) | F(PSE) |
208 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
209 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
210 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
211 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
212 0 /* Reserved, DS, ACPI */ | F(MMX) |
213 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
214 0 /* HTT, TM, Reserved, PBE */;
215 /* cpuid 0x80000001.edx */
216 const u32 kvm_supported_word1_x86_features =
217 F(FPU) | F(VME) | F(DE) | F(PSE) |
218 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
219 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
220 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
221 F(PAT) | F(PSE36) | 0 /* Reserved */ |
222 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
223 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
224 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
225 /* cpuid 1.ecx */
226 const u32 kvm_supported_word4_x86_features =
227 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
228 0 /* DS-CPL, VMX, SMX, EST */ |
229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
231 0 /* Reserved, DCA */ | F(XMM4_1) |
232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
234 F(F16C) | F(RDRAND);
235 /* cpuid 0x80000001.ecx */
236 const u32 kvm_supported_word6_x86_features =
237 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
238 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
239 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
240 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
241
242 /* cpuid 0xC0000001.edx */
243 const u32 kvm_supported_word5_x86_features =
244 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
245 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
246 F(PMM) | F(PMM_EN);
247
248 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
251
252 /* all calls to cpuid_count() should be made on the same cpu */
253 get_cpu();
254
255 r = -E2BIG;
256
257 if (*nent >= maxnent)
258 goto out;
259
260 do_cpuid_1_ent(entry, function, index);
261 ++*nent;
262
263 switch (function) {
264 case 0:
265 entry->eax = min(entry->eax, (u32)0xd);
266 break;
267 case 1:
268 entry->edx &= kvm_supported_word0_x86_features;
269 cpuid_mask(&entry->edx, 0);
270 entry->ecx &= kvm_supported_word4_x86_features;
271 cpuid_mask(&entry->ecx, 4);
272 /* we support x2apic emulation even if host does not support
273 * it since we emulate x2apic in software */
274 entry->ecx |= F(X2APIC);
275 break;
276 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
277 * may return different values. This forces us to get_cpu() before
278 * issuing the first command, and also to emulate this annoying behavior
279 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
280 case 2: {
281 int t, times = entry->eax & 0xff;
282
283 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
284 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
285 for (t = 1; t < times; ++t) {
286 if (*nent >= maxnent)
287 goto out;
288
289 do_cpuid_1_ent(&entry[t], function, 0);
290 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
291 ++*nent;
292 }
293 break;
294 }
295 /* function 4 has additional index. */
296 case 4: {
297 int i, cache_type;
298
299 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
300 /* read more entries until cache_type is zero */
301 for (i = 1; ; ++i) {
302 if (*nent >= maxnent)
303 goto out;
304
305 cache_type = entry[i - 1].eax & 0x1f;
306 if (!cache_type)
307 break;
308 do_cpuid_1_ent(&entry[i], function, i);
309 entry[i].flags |=
310 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
311 ++*nent;
312 }
313 break;
314 }
315 case 7: {
316 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
317 /* Mask ebx against host capability word 9 */
318 if (index == 0) {
319 entry->ebx &= kvm_supported_word9_x86_features;
320 cpuid_mask(&entry->ebx, 9);
321 } else
322 entry->ebx = 0;
323 entry->eax = 0;
324 entry->ecx = 0;
325 entry->edx = 0;
326 break;
327 }
328 case 9:
329 break;
330 case 0xa: { /* Architectural Performance Monitoring */
331 struct x86_pmu_capability cap;
332 union cpuid10_eax eax;
333 union cpuid10_edx edx;
334
335 perf_get_x86_pmu_capability(&cap);
336
337 /*
338 * Only support guest architectural pmu on a host
339 * with architectural pmu.
340 */
341 if (!cap.version)
342 memset(&cap, 0, sizeof(cap));
343
344 eax.split.version_id = min(cap.version, 2);
345 eax.split.num_counters = cap.num_counters_gp;
346 eax.split.bit_width = cap.bit_width_gp;
347 eax.split.mask_length = cap.events_mask_len;
348
349 edx.split.num_counters_fixed = cap.num_counters_fixed;
350 edx.split.bit_width_fixed = cap.bit_width_fixed;
351 edx.split.reserved = 0;
352
353 entry->eax = eax.full;
354 entry->ebx = cap.events_mask;
355 entry->ecx = 0;
356 entry->edx = edx.full;
357 break;
358 }
359 /* function 0xb has additional index. */
360 case 0xb: {
361 int i, level_type;
362
363 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
364 /* read more entries until level_type is zero */
365 for (i = 1; ; ++i) {
366 if (*nent >= maxnent)
367 goto out;
368
369 level_type = entry[i - 1].ecx & 0xff00;
370 if (!level_type)
371 break;
372 do_cpuid_1_ent(&entry[i], function, i);
373 entry[i].flags |=
374 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
375 ++*nent;
376 }
377 break;
378 }
379 case 0xd: {
380 int idx, i;
381
382 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
383 for (idx = 1, i = 1; idx < 64; ++idx) {
384 if (*nent >= maxnent)
385 goto out;
386
387 do_cpuid_1_ent(&entry[i], function, idx);
388 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
389 continue;
390 entry[i].flags |=
391 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
392 ++*nent;
393 ++i;
394 }
395 break;
396 }
397 case KVM_CPUID_SIGNATURE: {
398 char signature[12] = "KVMKVMKVM\0\0";
399 u32 *sigptr = (u32 *)signature;
400 entry->eax = 0;
401 entry->ebx = sigptr[0];
402 entry->ecx = sigptr[1];
403 entry->edx = sigptr[2];
404 break;
405 }
406 case KVM_CPUID_FEATURES:
407 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
408 (1 << KVM_FEATURE_NOP_IO_DELAY) |
409 (1 << KVM_FEATURE_CLOCKSOURCE2) |
410 (1 << KVM_FEATURE_ASYNC_PF) |
411 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
412
413 if (sched_info_on())
414 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
415
416 entry->ebx = 0;
417 entry->ecx = 0;
418 entry->edx = 0;
419 break;
420 case 0x80000000:
421 entry->eax = min(entry->eax, 0x8000001a);
422 break;
423 case 0x80000001:
424 entry->edx &= kvm_supported_word1_x86_features;
425 cpuid_mask(&entry->edx, 1);
426 entry->ecx &= kvm_supported_word6_x86_features;
427 cpuid_mask(&entry->ecx, 6);
428 break;
429 case 0x80000008: {
430 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
431 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
432 unsigned phys_as = entry->eax & 0xff;
433
434 if (!g_phys_as)
435 g_phys_as = phys_as;
436 entry->eax = g_phys_as | (virt_as << 8);
437 entry->ebx = entry->edx = 0;
438 break;
439 }
440 case 0x80000019:
441 entry->ecx = entry->edx = 0;
442 break;
443 case 0x8000001a:
444 break;
445 case 0x8000001d:
446 break;
447 /*Add support for Centaur's CPUID instruction*/
448 case 0xC0000000:
449 /*Just support up to 0xC0000004 now*/
450 entry->eax = min(entry->eax, 0xC0000004);
451 break;
452 case 0xC0000001:
453 entry->edx &= kvm_supported_word5_x86_features;
454 cpuid_mask(&entry->edx, 5);
455 break;
456 case 3: /* Processor serial number */
457 case 5: /* MONITOR/MWAIT */
458 case 6: /* Thermal management */
459 case 0x80000007: /* Advanced power management */
460 case 0xC0000002:
461 case 0xC0000003:
462 case 0xC0000004:
463 default:
464 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
465 break;
466 }
467
468 kvm_x86_ops->set_supported_cpuid(function, entry);
469
470 r = 0;
471
472out:
473 put_cpu();
474
475 return r;
476}
477
478#undef F
479
480struct kvm_cpuid_param {
481 u32 func;
482 u32 idx;
483 bool has_leaf_count;
484 bool (*qualifier)(struct kvm_cpuid_param *param);
485};
486
487static bool is_centaur_cpu(struct kvm_cpuid_param *param)
488{
489 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
490}
491
492int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
493 struct kvm_cpuid_entry2 __user *entries)
494{
495 struct kvm_cpuid_entry2 *cpuid_entries;
496 int limit, nent = 0, r = -E2BIG, i;
497 u32 func;
498 static struct kvm_cpuid_param param[] = {
499 { .func = 0, .has_leaf_count = true },
500 { .func = 0x80000000, .has_leaf_count = true },
501 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
502 { .func = KVM_CPUID_SIGNATURE },
503 { .func = KVM_CPUID_FEATURES },
504 };
505
506 if (cpuid->nent < 1)
507 goto out;
508 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
509 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
510 r = -ENOMEM;
511 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
512 if (!cpuid_entries)
513 goto out;
514
515 r = 0;
516 for (i = 0; i < ARRAY_SIZE(param); i++) {
517 struct kvm_cpuid_param *ent = &param[i];
518
519 if (ent->qualifier && !ent->qualifier(ent))
520 continue;
521
522 r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
523 &nent, cpuid->nent);
524
525 if (r)
526 goto out_free;
527
528 if (!ent->has_leaf_count)
529 continue;
530
531 limit = cpuid_entries[nent - 1].eax;
532 for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
533 r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
534 &nent, cpuid->nent);
535
536 if (r)
537 goto out_free;
538 }
539
540 r = -EFAULT;
541 if (copy_to_user(entries, cpuid_entries,
542 nent * sizeof(struct kvm_cpuid_entry2)))
543 goto out_free;
544 cpuid->nent = nent;
545 r = 0;
546
547out_free:
548 vfree(cpuid_entries);
549out:
550 return r;
551}
552
553static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
554{
555 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
556 int j, nent = vcpu->arch.cpuid_nent;
557
558 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
559 /* when no next entry is found, the current entry[i] is reselected */
560 for (j = i + 1; ; j = (j + 1) % nent) {
561 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
562 if (ej->function == e->function) {
563 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
564 return j;
565 }
566 }
567 return 0; /* silence gcc, even though control never reaches here */
568}
569
570/* find an entry with matching function, matching index (if needed), and that
571 * should be read next (if it's stateful) */
572static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
573 u32 function, u32 index)
574{
575 if (e->function != function)
576 return 0;
577 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
578 return 0;
579 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
580 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
581 return 0;
582 return 1;
583}
584
585struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
586 u32 function, u32 index)
587{
588 int i;
589 struct kvm_cpuid_entry2 *best = NULL;
590
591 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
592 struct kvm_cpuid_entry2 *e;
593
594 e = &vcpu->arch.cpuid_entries[i];
595 if (is_matching_cpuid_entry(e, function, index)) {
596 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
597 move_to_next_stateful_cpuid_entry(vcpu, i);
598 best = e;
599 break;
600 }
601 }
602 return best;
603}
604EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
605
606int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
607{
608 struct kvm_cpuid_entry2 *best;
609
610 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
611 if (!best || best->eax < 0x80000008)
612 goto not_found;
613 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
614 if (best)
615 return best->eax & 0xff;
616not_found:
617 return 36;
618}
619
620/*
621 * If no match is found, check whether we exceed the vCPU's limit
622 * and return the content of the highest valid _standard_ leaf instead.
623 * This is to satisfy the CPUID specification.
624 */
625static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
626 u32 function, u32 index)
627{
628 struct kvm_cpuid_entry2 *maxlevel;
629
630 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
631 if (!maxlevel || maxlevel->eax >= function)
632 return NULL;
633 if (function & 0x80000000) {
634 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
635 if (!maxlevel)
636 return NULL;
637 }
638 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
639}
640
641void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
642{
643 u32 function, index;
644 struct kvm_cpuid_entry2 *best;
645
646 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
647 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
648 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
649 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
650 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
651 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
652 best = kvm_find_cpuid_entry(vcpu, function, index);
653
654 if (!best)
655 best = check_cpuid_limit(vcpu, function, index);
656
657 if (best) {
658 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
659 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
660 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
661 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
662 }
663 kvm_x86_ops->skip_emulated_instruction(vcpu);
664 trace_kvm_cpuid(function,
665 kvm_register_read(vcpu, VCPU_REGS_RAX),
666 kvm_register_read(vcpu, VCPU_REGS_RBX),
667 kvm_register_read(vcpu, VCPU_REGS_RCX),
668 kvm_register_read(vcpu, VCPU_REGS_RDX));
669}
670EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
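Note: kvm_dev_ioctl_get_supported_cpuid() above is what backs the KVM_GET_SUPPORTED_CPUID ioctl on /dev/kvm. A minimal user-space sketch of querying it (error handling trimmed; the entry count of 128 is an arbitrary upper bound chosen for the example):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int nent = 128;		/* arbitrary upper bound for this example */
	struct kvm_cpuid2 *cpuid;

	if (kvm < 0)
		return 1;
	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
	cpuid->nent = nent;

	if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid) == 0) {
		unsigned i;
		for (i = 0; i < cpuid->nent; i++)
			printf("func %#x idx %u: eax=%#x ecx=%#x\n",
			       cpuid->entries[i].function,
			       cpuid->entries[i].index,
			       cpuid->entries[i].eax,
			       cpuid->entries[i].ecx);
	}
	return 0;
}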
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..5b97e1797a6d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@
1#ifndef ARCH_X86_KVM_CPUID_H
2#define ARCH_X86_KVM_CPUID_H
3
4#include "x86.h"
5
6void kvm_update_cpuid(struct kvm_vcpu *vcpu);
7struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
8 u32 function, u32 index);
9int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
10 struct kvm_cpuid_entry2 __user *entries);
11int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
12 struct kvm_cpuid *cpuid,
13 struct kvm_cpuid_entry __user *entries);
14int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
15 struct kvm_cpuid2 *cpuid,
16 struct kvm_cpuid_entry2 __user *entries);
17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
18 struct kvm_cpuid2 *cpuid,
19 struct kvm_cpuid_entry2 __user *entries);
20
21
22static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
23{
24 struct kvm_cpuid_entry2 *best;
25
26 best = kvm_find_cpuid_entry(vcpu, 1, 0);
27 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
28}
29
30static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
31{
32 struct kvm_cpuid_entry2 *best;
33
34 best = kvm_find_cpuid_entry(vcpu, 7, 0);
35 return best && (best->ebx & bit(X86_FEATURE_SMEP));
36}
37
38static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
39{
40 struct kvm_cpuid_entry2 *best;
41
42 best = kvm_find_cpuid_entry(vcpu, 7, 0);
43 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
44}
45
46#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18a08f..05a562b85025 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@
125#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 125#define Lock (1<<26) /* lock prefix is allowed for the instruction */
126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
127#define No64 (1<<28) 127#define No64 (1<<28)
128#define PageTable (1 << 29) /* instruction used to write page table */
128/* Source 2 operand type */ 129/* Source 2 operand type */
129#define Src2Shift (29) 130#define Src2Shift (30)
130#define Src2None (OpNone << Src2Shift) 131#define Src2None (OpNone << Src2Shift)
131#define Src2CL (OpCL << Src2Shift) 132#define Src2CL (OpCL << Src2Shift)
132#define Src2ImmByte (OpImmByte << Src2Shift) 133#define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1674,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1674 return X86EMUL_CONTINUE; 1675 return X86EMUL_CONTINUE;
1675} 1676}
1676 1677
1677static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1678{
1679 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
1680}
1681
1682static int em_grp2(struct x86_emulate_ctxt *ctxt) 1678static int em_grp2(struct x86_emulate_ctxt *ctxt)
1683{ 1679{
1684 switch (ctxt->modrm_reg) { 1680 switch (ctxt->modrm_reg) {
@@ -1788,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1788 return rc; 1784 return rc;
1789} 1785}
1790 1786
1791static int em_grp9(struct x86_emulate_ctxt *ctxt) 1787static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
1792{ 1788{
1793 u64 old = ctxt->dst.orig_val64; 1789 u64 old = ctxt->dst.orig_val64;
1794 1790
@@ -1831,6 +1827,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1831 return rc; 1827 return rc;
1832} 1828}
1833 1829
1830static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
1831{
1832 /* Save real source value, then compare EAX against destination. */
1833 ctxt->src.orig_val = ctxt->src.val;
1834 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
1835 emulate_2op_SrcV(ctxt, "cmp");
1836
1837 if (ctxt->eflags & EFLG_ZF) {
1838 /* Success: write back to memory. */
1839 ctxt->dst.val = ctxt->src.orig_val;
1840 } else {
1841 /* Failure: write the value we saw to EAX. */
1842 ctxt->dst.type = OP_REG;
1843 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
1844 }
1845 return X86EMUL_CONTINUE;
1846}
1847
1834static int em_lseg(struct x86_emulate_ctxt *ctxt) 1848static int em_lseg(struct x86_emulate_ctxt *ctxt)
1835{ 1849{
1836 int seg = ctxt->src2.val; 1850 int seg = ctxt->src2.val;
@@ -2481,6 +2495,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+	long rel = ctxt->src.val;
+
+	ctxt->src.val = (unsigned long)ctxt->_eip;
+	jmp_rel(ctxt, rel);
+	return em_push(ctxt);
+}
+
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel, old_cs;
@@ -2622,12 +2645,75 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2622 return X86EMUL_CONTINUE; 2645 return X86EMUL_CONTINUE;
2623} 2646}
2624 2647
2648static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2649{
2650 u64 pmc;
2651
2652 if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
2653 return emulate_gp(ctxt, 0);
2654 ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
2655 ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
2656 return X86EMUL_CONTINUE;
2657}
2658
2625static int em_mov(struct x86_emulate_ctxt *ctxt) 2659static int em_mov(struct x86_emulate_ctxt *ctxt)
2626{ 2660{
2627 ctxt->dst.val = ctxt->src.val; 2661 ctxt->dst.val = ctxt->src.val;
2628 return X86EMUL_CONTINUE; 2662 return X86EMUL_CONTINUE;
2629} 2663}
2630 2664
2665static int em_cr_write(struct x86_emulate_ctxt *ctxt)
2666{
2667 if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
2668 return emulate_gp(ctxt, 0);
2669
2670 /* Disable writeback. */
2671 ctxt->dst.type = OP_NONE;
2672 return X86EMUL_CONTINUE;
2673}
2674
2675static int em_dr_write(struct x86_emulate_ctxt *ctxt)
2676{
2677 unsigned long val;
2678
2679 if (ctxt->mode == X86EMUL_MODE_PROT64)
2680 val = ctxt->src.val & ~0ULL;
2681 else
2682 val = ctxt->src.val & ~0U;
2683
2684 /* #UD condition is already handled. */
2685 if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
2686 return emulate_gp(ctxt, 0);
2687
2688 /* Disable writeback. */
2689 ctxt->dst.type = OP_NONE;
2690 return X86EMUL_CONTINUE;
2691}
2692
2693static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
2694{
2695 u64 msr_data;
2696
2697 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
2698 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
2699 if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
2700 return emulate_gp(ctxt, 0);
2701
2702 return X86EMUL_CONTINUE;
2703}
2704
2705static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
2706{
2707 u64 msr_data;
2708
2709 if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
2710 return emulate_gp(ctxt, 0);
2711
2712 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
2713 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
2714 return X86EMUL_CONTINUE;
2715}
2716
2631static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) 2717static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
2632{ 2718{
2633 if (ctxt->modrm_reg > VCPU_SREG_GS) 2719 if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -2775,6 +2861,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
2775 return X86EMUL_CONTINUE; 2861 return X86EMUL_CONTINUE;
2776} 2862}
2777 2863
2864static int em_in(struct x86_emulate_ctxt *ctxt)
2865{
2866 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
2867 &ctxt->dst.val))
2868 return X86EMUL_IO_NEEDED;
2869
2870 return X86EMUL_CONTINUE;
2871}
2872
2873static int em_out(struct x86_emulate_ctxt *ctxt)
2874{
2875 ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
2876 &ctxt->src.val, 1);
2877 /* Disable writeback. */
2878 ctxt->dst.type = OP_NONE;
2879 return X86EMUL_CONTINUE;
2880}
2881
2778static int em_cli(struct x86_emulate_ctxt *ctxt) 2882static int em_cli(struct x86_emulate_ctxt *ctxt)
2779{ 2883{
2780 if (emulator_bad_iopl(ctxt)) 2884 if (emulator_bad_iopl(ctxt))
@@ -2794,6 +2898,69 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
2794 return X86EMUL_CONTINUE; 2898 return X86EMUL_CONTINUE;
2795} 2899}
2796 2900
2901static int em_bt(struct x86_emulate_ctxt *ctxt)
2902{
2903 /* Disable writeback. */
2904 ctxt->dst.type = OP_NONE;
2905 /* only subword offset */
2906 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
2907
2908 emulate_2op_SrcV_nobyte(ctxt, "bt");
2909 return X86EMUL_CONTINUE;
2910}
2911
2912static int em_bts(struct x86_emulate_ctxt *ctxt)
2913{
2914 emulate_2op_SrcV_nobyte(ctxt, "bts");
2915 return X86EMUL_CONTINUE;
2916}
2917
2918static int em_btr(struct x86_emulate_ctxt *ctxt)
2919{
2920 emulate_2op_SrcV_nobyte(ctxt, "btr");
2921 return X86EMUL_CONTINUE;
2922}
2923
2924static int em_btc(struct x86_emulate_ctxt *ctxt)
2925{
2926 emulate_2op_SrcV_nobyte(ctxt, "btc");
2927 return X86EMUL_CONTINUE;
2928}
2929
2930static int em_bsf(struct x86_emulate_ctxt *ctxt)
2931{
2932 u8 zf;
2933
2934 __asm__ ("bsf %2, %0; setz %1"
2935 : "=r"(ctxt->dst.val), "=q"(zf)
2936 : "r"(ctxt->src.val));
2937
2938 ctxt->eflags &= ~X86_EFLAGS_ZF;
2939 if (zf) {
2940 ctxt->eflags |= X86_EFLAGS_ZF;
2941 /* Disable writeback. */
2942 ctxt->dst.type = OP_NONE;
2943 }
2944 return X86EMUL_CONTINUE;
2945}
2946
2947static int em_bsr(struct x86_emulate_ctxt *ctxt)
2948{
2949 u8 zf;
2950
2951 __asm__ ("bsr %2, %0; setz %1"
2952 : "=r"(ctxt->dst.val), "=q"(zf)
2953 : "r"(ctxt->src.val));
2954
2955 ctxt->eflags &= ~X86_EFLAGS_ZF;
2956 if (zf) {
2957 ctxt->eflags |= X86_EFLAGS_ZF;
2958 /* Disable writeback. */
2959 ctxt->dst.type = OP_NONE;
2960 }
2961 return X86EMUL_CONTINUE;
2962}
2963
2797static bool valid_cr(int nr) 2964static bool valid_cr(int nr)
2798{ 2965{
2799 switch (nr) { 2966 switch (nr) {
@@ -2867,9 +3034,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 		break;
 	}
 	case 4: {
-		u64 cr4;
-
-		cr4 = ctxt->ops->get_cr(ctxt, 4);
 		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 
 		if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
@@ -3003,6 +3167,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define D2bv(_f) D((_f) | ByteOp), D(_f)
 #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
 #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+	IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
 
 #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
 		I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
@@ -3033,17 +3199,17 @@ static struct opcode group7_rm7[] = {
 
 static struct opcode group1[] = {
 	I(Lock, em_add),
-	I(Lock, em_or),
+	I(Lock | PageTable, em_or),
 	I(Lock, em_adc),
 	I(Lock, em_sbb),
-	I(Lock, em_and),
+	I(Lock | PageTable, em_and),
 	I(Lock, em_sub),
 	I(Lock, em_xor),
 	I(0, em_cmp),
 };
 
 static struct opcode group1A[] = {
-	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
@@ -3058,16 +3224,19 @@ static struct opcode group3[] = {
 };
 
 static struct opcode group4[] = {
-	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45),
 	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
-	D(SrcMem | ModRM | Stack), N,
+	I(SrcMem | ModRM | Stack, em_grp45),
+	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45), N,
 };
 
 static struct opcode group6[] = {
@@ -3096,18 +3265,21 @@ static struct group_dual group7 = { {
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+	I(DstMem | SrcImmByte | ModRM, em_bt),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
+	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
 };
 
 static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	X7(D(Undefined)),
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
@@ -3120,7 +3292,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
 	/* 0x08 - 0x0F */
-	I6ALU(Lock, em_or),
+	I6ALU(Lock | PageTable, em_or),
 	I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
 	N,
 	/* 0x10 - 0x17 */
@@ -3132,7 +3304,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
 	/* 0x20 - 0x27 */
-	I6ALU(Lock, em_and), N, N,
+	I6ALU(Lock | PageTable, em_and), N, N,
 	/* 0x28 - 0x2F */
 	I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
@@ -3155,8 +3327,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
-	D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
+	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
@@ -3165,11 +3337,11 @@ static struct opcode opcode_table[256] = {
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
-	I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
-	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+	I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-	I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+	I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
 	D(ModRM | SrcMem | NoAccess | DstReg),
 	I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
 	G(0, group1A),
@@ -3182,7 +3354,7 @@ static struct opcode opcode_table[256] = {
 	II(ImplicitOps | Stack, em_popf, popf), N, N,
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
-	I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
 	I2bv(SrcSI | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstDI | String, em_cmp),
 	/* 0xA8 - 0xAF */
@@ -3213,13 +3385,13 @@ static struct opcode opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	X3(I(SrcImmByte, em_loop)),
 	I(SrcImmByte, em_jcxz),
-	D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
-	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
+	I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
 	/* 0xE8 - 0xEF */
-	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
 	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
-	D2bvIP(SrcDX | DstAcc, in, check_perm_in),
-	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
+	I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+	I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
 	N, DI(ImplicitOps, icebp), N, N,
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3242,15 +3414,15 @@ static struct opcode twobyte_table[256] = {
 	/* 0x20 - 0x2F */
 	DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
 	DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-	DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
-	DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-	DI(ImplicitOps | Priv, wrmsr),
+	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
-	DI(ImplicitOps | Priv, rdmsr),
-	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
+	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+	IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
@@ -3275,26 +3447,28 @@ static struct opcode twobyte_table[256] = {
 	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
 	/* 0xA0 - 0xA7 */
 	I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
-	DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
+	DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	DI(ImplicitOps, rsm),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
-	D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	G(BitOp, group8),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3320,6 +3494,7 @@ static struct opcode twobyte_table[256] = {
 #undef D2bv
 #undef D2bvIP
 #undef I2bv
+#undef I2bvIP
 #undef I6ALU
 
 static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
@@ -3697,6 +3872,11 @@ done:
 	return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+	return ctxt->d & PageTable;
+}
+
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
 	/* The second termination condition only applies for REPE
@@ -3720,7 +3900,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
-	u64 msr_data;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = ctxt->dst.type;
 
@@ -3854,15 +4033,6 @@ special_insn:
3854 goto cannot_emulate; 4033 goto cannot_emulate;
3855 ctxt->dst.val = (s32) ctxt->src.val; 4034 ctxt->dst.val = (s32) ctxt->src.val;
3856 break; 4035 break;
3857 case 0x6c: /* insb */
3858 case 0x6d: /* insw/insd */
3859 ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
3860 goto do_io_in;
3861 case 0x6e: /* outsb */
3862 case 0x6f: /* outsw/outsd */
3863 ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
3864 goto do_io_out;
3865 break;
3866 case 0x70 ... 0x7f: /* jcc (short) */ 4036 case 0x70 ... 0x7f: /* jcc (short) */
3867 if (test_cc(ctxt->b, ctxt->eflags)) 4037 if (test_cc(ctxt->b, ctxt->eflags))
3868 jmp_rel(ctxt, ctxt->src.val); 4038 jmp_rel(ctxt, ctxt->src.val);
@@ -3870,9 +4040,6 @@ special_insn:
3870 case 0x8d: /* lea r16/r32, m */ 4040 case 0x8d: /* lea r16/r32, m */
3871 ctxt->dst.val = ctxt->src.addr.mem.ea; 4041 ctxt->dst.val = ctxt->src.addr.mem.ea;
3872 break; 4042 break;
3873 case 0x8f: /* pop (sole member of Grp1a) */
3874 rc = em_grp1a(ctxt);
3875 break;
3876 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 4043 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3877 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) 4044 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
3878 break; 4045 break;
@@ -3905,38 +4072,11 @@ special_insn:
3905 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; 4072 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
3906 rc = em_grp2(ctxt); 4073 rc = em_grp2(ctxt);
3907 break; 4074 break;
3908 case 0xe4: /* inb */
3909 case 0xe5: /* in */
3910 goto do_io_in;
3911 case 0xe6: /* outb */
3912 case 0xe7: /* out */
3913 goto do_io_out;
3914 case 0xe8: /* call (near) */ {
3915 long int rel = ctxt->src.val;
3916 ctxt->src.val = (unsigned long) ctxt->_eip;
3917 jmp_rel(ctxt, rel);
3918 rc = em_push(ctxt);
3919 break;
3920 }
3921 case 0xe9: /* jmp rel */ 4075 case 0xe9: /* jmp rel */
3922 case 0xeb: /* jmp rel short */ 4076 case 0xeb: /* jmp rel short */
3923 jmp_rel(ctxt, ctxt->src.val); 4077 jmp_rel(ctxt, ctxt->src.val);
3924 ctxt->dst.type = OP_NONE; /* Disable writeback. */ 4078 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3925 break; 4079 break;
3926 case 0xec: /* in al,dx */
3927 case 0xed: /* in (e/r)ax,dx */
3928 do_io_in:
3929 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
3930 &ctxt->dst.val))
3931 goto done; /* IO is needed */
3932 break;
3933 case 0xee: /* out dx,al */
3934 case 0xef: /* out dx,(e/r)ax */
3935 do_io_out:
3936 ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
3937 &ctxt->src.val, 1);
3938 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3939 break;
3940 case 0xf4: /* hlt */ 4080 case 0xf4: /* hlt */
3941 ctxt->ops->halt(ctxt); 4081 ctxt->ops->halt(ctxt);
3942 break; 4082 break;
@@ -3956,12 +4096,6 @@ special_insn:
3956 case 0xfd: /* std */ 4096 case 0xfd: /* std */
3957 ctxt->eflags |= EFLG_DF; 4097 ctxt->eflags |= EFLG_DF;
3958 break; 4098 break;
3959 case 0xfe: /* Grp4 */
3960 rc = em_grp45(ctxt);
3961 break;
3962 case 0xff: /* Grp5 */
3963 rc = em_grp45(ctxt);
3964 break;
3965 default: 4099 default:
3966 goto cannot_emulate; 4100 goto cannot_emulate;
3967 } 4101 }
@@ -4036,49 +4170,6 @@ twobyte_insn:
4036 case 0x21: /* mov from dr to reg */ 4170 case 0x21: /* mov from dr to reg */
4037 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); 4171 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
4038 break; 4172 break;
4039 case 0x22: /* mov reg, cr */
4040 if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
4041 emulate_gp(ctxt, 0);
4042 rc = X86EMUL_PROPAGATE_FAULT;
4043 goto done;
4044 }
4045 ctxt->dst.type = OP_NONE;
4046 break;
4047 case 0x23: /* mov from reg to dr */
4048 if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
4049 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
4050 ~0ULL : ~0U)) < 0) {
4051 /* #UD condition is already handled by the code above */
4052 emulate_gp(ctxt, 0);
4053 rc = X86EMUL_PROPAGATE_FAULT;
4054 goto done;
4055 }
4056
4057 ctxt->dst.type = OP_NONE; /* no writeback */
4058 break;
4059 case 0x30:
4060 /* wrmsr */
4061 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
4062 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
4063 if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
4064 emulate_gp(ctxt, 0);
4065 rc = X86EMUL_PROPAGATE_FAULT;
4066 goto done;
4067 }
4068 rc = X86EMUL_CONTINUE;
4069 break;
4070 case 0x32:
4071 /* rdmsr */
4072 if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
4073 emulate_gp(ctxt, 0);
4074 rc = X86EMUL_PROPAGATE_FAULT;
4075 goto done;
4076 } else {
4077 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
4078 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
4079 }
4080 rc = X86EMUL_CONTINUE;
4081 break;
4082 case 0x40 ... 0x4f: /* cmov */ 4173 case 0x40 ... 0x4f: /* cmov */
4083 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; 4174 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
4084 if (!test_cc(ctxt->b, ctxt->eflags)) 4175 if (!test_cc(ctxt->b, ctxt->eflags))
@@ -4091,93 +4182,21 @@ twobyte_insn:
4091 case 0x90 ... 0x9f: /* setcc r/m8 */ 4182 case 0x90 ... 0x9f: /* setcc r/m8 */
4092 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4183 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4093 break; 4184 break;
4094 case 0xa3:
4095 bt: /* bt */
4096 ctxt->dst.type = OP_NONE;
4097 /* only subword offset */
4098 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4099 emulate_2op_SrcV_nobyte(ctxt, "bt");
4100 break;
4101 case 0xa4: /* shld imm8, r, r/m */ 4185 case 0xa4: /* shld imm8, r, r/m */
4102 case 0xa5: /* shld cl, r, r/m */ 4186 case 0xa5: /* shld cl, r, r/m */
4103 emulate_2op_cl(ctxt, "shld"); 4187 emulate_2op_cl(ctxt, "shld");
4104 break; 4188 break;
4105 case 0xab:
4106 bts: /* bts */
4107 emulate_2op_SrcV_nobyte(ctxt, "bts");
4108 break;
4109 case 0xac: /* shrd imm8, r, r/m */ 4189 case 0xac: /* shrd imm8, r, r/m */
4110 case 0xad: /* shrd cl, r, r/m */ 4190 case 0xad: /* shrd cl, r, r/m */
4111 emulate_2op_cl(ctxt, "shrd"); 4191 emulate_2op_cl(ctxt, "shrd");
4112 break; 4192 break;
4113 case 0xae: /* clflush */ 4193 case 0xae: /* clflush */
4114 break; 4194 break;
4115 case 0xb0 ... 0xb1: /* cmpxchg */
4116 /*
4117 * Save real source value, then compare EAX against
4118 * destination.
4119 */
4120 ctxt->src.orig_val = ctxt->src.val;
4121 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4122 emulate_2op_SrcV(ctxt, "cmp");
4123 if (ctxt->eflags & EFLG_ZF) {
4124 /* Success: write back to memory. */
4125 ctxt->dst.val = ctxt->src.orig_val;
4126 } else {
4127 /* Failure: write the value we saw to EAX. */
4128 ctxt->dst.type = OP_REG;
4129 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4130 }
4131 break;
4132 case 0xb3:
4133 btr: /* btr */
4134 emulate_2op_SrcV_nobyte(ctxt, "btr");
4135 break;
4136 case 0xb6 ... 0xb7: /* movzx */ 4195 case 0xb6 ... 0xb7: /* movzx */
4137 ctxt->dst.bytes = ctxt->op_bytes; 4196 ctxt->dst.bytes = ctxt->op_bytes;
4138 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4197 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
4139 : (u16) ctxt->src.val; 4198 : (u16) ctxt->src.val;
4140 break; 4199 break;
4141 case 0xba: /* Grp8 */
4142 switch (ctxt->modrm_reg & 3) {
4143 case 0:
4144 goto bt;
4145 case 1:
4146 goto bts;
4147 case 2:
4148 goto btr;
4149 case 3:
4150 goto btc;
4151 }
4152 break;
4153 case 0xbb:
4154 btc: /* btc */
4155 emulate_2op_SrcV_nobyte(ctxt, "btc");
4156 break;
4157 case 0xbc: { /* bsf */
4158 u8 zf;
4159 __asm__ ("bsf %2, %0; setz %1"
4160 : "=r"(ctxt->dst.val), "=q"(zf)
4161 : "r"(ctxt->src.val));
4162 ctxt->eflags &= ~X86_EFLAGS_ZF;
4163 if (zf) {
4164 ctxt->eflags |= X86_EFLAGS_ZF;
4165 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4166 }
4167 break;
4168 }
4169 case 0xbd: { /* bsr */
4170 u8 zf;
4171 __asm__ ("bsr %2, %0; setz %1"
4172 : "=r"(ctxt->dst.val), "=q"(zf)
4173 : "r"(ctxt->src.val));
4174 ctxt->eflags &= ~X86_EFLAGS_ZF;
4175 if (zf) {
4176 ctxt->eflags |= X86_EFLAGS_ZF;
4177 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4178 }
4179 break;
4180 }
4181 case 0xbe ... 0xbf: /* movsx */ 4200 case 0xbe ... 0xbf: /* movsx */
4182 ctxt->dst.bytes = ctxt->op_bytes; 4201 ctxt->dst.bytes = ctxt->op_bytes;
4183 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4202 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
@@ -4194,9 +4213,6 @@ twobyte_insn:
4194 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : 4213 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
4195 (u64) ctxt->src.val; 4214 (u64) ctxt->src.val;
4196 break; 4215 break;
4197 case 0xc7: /* Grp9 (cmpxchg8b) */
4198 rc = em_grp9(ctxt);
4199 break;
4200 default: 4216 default:
4201 goto cannot_emulate; 4217 goto cannot_emulate;
4202 } 4218 }
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 76e3f1cd0369..d68f99df690c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 	return HRTIMER_NORESTART;
 }
 
-static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
+static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 {
+	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 
+	if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+		return;
+
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
 
 	pr_debug("create pit timer, interval is %llu nsec\n", interval);
@@ -393,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
-			create_pit_timer(ps, val, 0);
-		}
+		create_pit_timer(kvm, val, 0);
 		break;
 	case 2:
 	case 3:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
-			create_pit_timer(ps, val, 1);
-		}
+		create_pit_timer(kvm, val, 1);
 		break;
 	default:
 		destroy_pit_timer(kvm->arch.vpit);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746d7ffb..b6a73537e1ef 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq;
-	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+	int irq, i;
+	struct kvm_vcpu *vcpu;
 	u8 irr = s->irr, isr = s->imr;
+	bool found = false;
 
 	s->last_irr = 0;
 	s->irr = 0;
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->special_fully_nested_mode = 0;
 	s->init4 = 0;
 
-	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
-		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (irr & (1 << irq) || isr & (1 << irq)) {
-				pic_clear_isr(s, irq);
-			}
-	}
+	kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+		if (kvm_apic_accept_pic_intr(vcpu)) {
+			found = true;
+			break;
+		}
+
+
+	if (!found)
+		return;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+		if (irr & (1 << irq) || isr & (1 << irq))
+			pic_clear_isr(s, irq);
 }
 
 static void pic_ioport_write(void *opaque, u32 addr, u32 val)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40199d6..cfdc6e0ef002 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@
38#include "irq.h" 38#include "irq.h"
39#include "trace.h" 39#include "trace.h"
40#include "x86.h" 40#include "x86.h"
41#include "cpuid.h"
41 42
42#ifndef CONFIG_X86_64 43#ifndef CONFIG_X86_64
43#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) 44#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -1120,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
 	u32 reg = apic_get_reg(apic, lvt_type);
 	int vector, mode, trig_mode;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc6fea6..6f4ce2575d09 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d0..224b02c3cda9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,15 +59,6 @@ enum {
59 AUDIT_POST_SYNC 59 AUDIT_POST_SYNC
60}; 60};
61 61
62char *audit_point_name[] = {
63 "pre page fault",
64 "post page fault",
65 "pre pte write",
66 "post pte write",
67 "pre sync",
68 "post sync"
69};
70
71#undef MMU_DEBUG 62#undef MMU_DEBUG
72 63
73#ifdef MMU_DEBUG 64#ifdef MMU_DEBUG
@@ -83,13 +74,10 @@ char *audit_point_name[] = {
 #endif
 
 #ifdef MMU_DEBUG
-static int dbg = 0;
+static bool dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
593 return 0; 581 return 0;
594} 582}
595 583
584static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
585{
586 return cache->nobjs;
587}
588
596static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, 589static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
597 struct kmem_cache *cache) 590 struct kmem_cache *cache)
598{ 591{
@@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
+static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+				    struct kvm_memory_slot *slot)
+{
+	struct kvm_lpage_info *linfo;
+
+	if (likely(level == PT_PAGE_TABLE_LEVEL))
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	linfo = lpage_info_slot(gfn, slot, level);
+	return &linfo->rmap_pde;
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  */
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
 	struct kvm_memory_slot *slot;
-	struct kvm_lpage_info *linfo;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	if (likely(level == PT_PAGE_TABLE_LEVEL))
-		return &slot->rmap[gfn - slot->base_gfn];
+	return __gfn_to_rmap(kvm, gfn, level, slot);
+}
 
-	linfo = lpage_info_slot(gfn, slot, level);
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *cache;
 
-	return &linfo->rmap_pde;
+	cache = &vcpu->arch.mmu_pte_list_desc_cache;
+	return mmu_memory_cache_free_objects(cache);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 	rmap_remove(kvm, sptep);
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
 {
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
-
+	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
-		BUG_ON(!spte);
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
@@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = gfn_to_rmap(kvm, gfn, i);
+		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
-			BUG_ON(!spte);
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+			BUG_ON(!is_large_pte(*spte));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
 				drop_spte(kvm, spte);
@@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	return write_protected;
 }
 
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
@@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 unsigned long data))
 {
-	int i, j;
+	int j;
 	int ret;
 	int retval = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		struct kvm_memory_slot *memslot = &slots->memslots[i];
+	kvm_for_each_memslot(memslot, slots) {
 		unsigned long start = memslot->userspace_addr;
 		unsigned long end;
 
@@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1511 return ret; 1524 return ret;
1512} 1525}
1513 1526
1527#ifdef CONFIG_KVM_MMU_AUDIT
1528#include "mmu_audit.c"
1529#else
1530static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
1531static void mmu_audit_disable(void) { }
1532#endif
1533
1514static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1534static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1515 struct list_head *invalid_list) 1535 struct list_head *invalid_list)
1516{ 1536{
@@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
1640 sp->spt[i] = 0ull; 1660 sp->spt[i] = 0ull;
1641} 1661}
1642 1662
1663static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
1664{
1665 sp->write_flooding_count = 0;
1666}
1667
1668static void clear_sp_write_flooding_count(u64 *spte)
1669{
1670 struct kvm_mmu_page *sp = page_header(__pa(spte));
1671
1672 __clear_sp_write_flooding_count(sp);
1673}
1674
1643static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1675static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1644 gfn_t gfn, 1676 gfn_t gfn,
1645 gva_t gaddr, 1677 gva_t gaddr,
@@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1683 } else if (sp->unsync) 1715 } else if (sp->unsync)
1684 kvm_mmu_mark_parents_unsync(sp); 1716 kvm_mmu_mark_parents_unsync(sp);
1685 1717
1718 __clear_sp_write_flooding_count(sp);
1686 trace_kvm_mmu_get_page(sp, false); 1719 trace_kvm_mmu_get_page(sp, false);
1687 return sp; 1720 return sp;
1688 } 1721 }
@@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
-static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			     u64 *spte)
 {
 	u64 pte;
@@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
+		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
-		else {
+			if (is_large_pte(pte))
+				--kvm->stat.lpages;
+		} else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	} else if (is_mmio_spte(pte))
+		return true;
+	}
+
+	if (is_mmio_spte(pte))
 		mmu_spte_clear_no_track(spte);
 
-	if (is_large_pte(pte))
-		--kvm->stat.lpages;
+	return false;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1831 mmu_page_remove_parent_pte(sp, parent_pte); 1868 mmu_page_remove_parent_pte(sp, parent_pte);
1832} 1869}
1833 1870
1834static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1835{
1836 int i;
1837 struct kvm_vcpu *vcpu;
1838
1839 kvm_for_each_vcpu(i, vcpu, kvm)
1840 vcpu->arch.last_pte_updated = NULL;
1841}
1842
1843static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1871static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1844{ 1872{
1845 u64 *parent_pte; 1873 u64 *parent_pte;
@@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1899 } 1927 }
1900 1928
1901 sp->role.invalid = 1; 1929 sp->role.invalid = 1;
1902 kvm_mmu_reset_last_pte_updated(kvm);
1903 return ret; 1930 return ret;
1904} 1931}
1905 1932
@@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
@@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
-
+	spin_lock(&kvm->mmu_lock);
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
@@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
+	spin_unlock(&kvm->mmu_lock);
 
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %llx %x\n",
-			 __func__, gfn, sp->role.word);
-		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-	}
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 {
@@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 
 		if (!need_unsync && !s->unsync) {
-			if (!oos_shadow)
-				return 1;
 			need_unsync = true;
 		}
 	}
@@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
 		return 0;
 
-	/*
-	 * We don't set the accessed bit, since we sometimes want to see
-	 * whether the guest actually used the pte (in order to detect
-	 * demand paging).
-	 */
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
@@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative) {
-		vcpu->arch.last_pte_updated = sptep;
-		vcpu->arch.last_pte_gfn = gfn;
-	}
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, ~0ul);
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
-		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		return;
 	}
 	for (i = 0; i < 4; ++i) {
@@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3510 kvm_mmu_flush_tlb(vcpu); 3515 kvm_mmu_flush_tlb(vcpu);
3511} 3516}
3512 3517
3513static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) 3518static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3519 const u8 *new, int *bytes)
3514{ 3520{
3515 u64 *spte = vcpu->arch.last_pte_updated; 3521 u64 gentry;
3522 int r;
3523
3524 /*
3525 * Assume that the pte write on a page table of the same type
3526 * as the current vcpu paging mode since we update the sptes only
3527 * when they have the same mode.
3528 */
3529 if (is_pae(vcpu) && *bytes == 4) {
3530 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3531 *gpa &= ~(gpa_t)7;
3532 *bytes = 8;
3533 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
3534 if (r)
3535 gentry = 0;
3536 new = (const u8 *)&gentry;
3537 }
3516 3538
3517 return !!(spte && (*spte & shadow_accessed_mask)); 3539 switch (*bytes) {
3540 case 4:
3541 gentry = *(const u32 *)new;
3542 break;
3543 case 8:
3544 gentry = *(const u64 *)new;
3545 break;
3546 default:
3547 gentry = 0;
3548 break;
3549 }
3550
3551 return gentry;
3518} 3552}
3519 3553
3520static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3554/*
3555 * If we're seeing too many writes to a page, it may no longer be a page table,
3556 * or we may be forking, in which case it is better to unmap the page.
3557 */
3558static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
3521{ 3559{
3522 u64 *spte = vcpu->arch.last_pte_updated; 3560 /*
3561 * Skip write-flooding detected for the sp whose level is 1, because
3562 * it can become unsync, then the guest page is not write-protected.
3563 */
3564 if (sp->role.level == 1)
3565 return false;
3523 3566
3524 if (spte 3567 return ++sp->write_flooding_count >= 3;
3525 && vcpu->arch.last_pte_gfn == gfn 3568}
3526 && shadow_accessed_mask 3569
3527 && !(*spte & shadow_accessed_mask) 3570/*
3528 && is_shadow_present_pte(*spte)) 3571 * Misaligned accesses are too much trouble to fix up; also, they usually
3529 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 3572 * indicate a page is not used as a page table.
3573 */
3574static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
3575 int bytes)
3576{
3577 unsigned offset, pte_size, misaligned;
3578
3579 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3580 gpa, bytes, sp->role.word);
3581
3582 offset = offset_in_page(gpa);
3583 pte_size = sp->role.cr4_pae ? 8 : 4;
3584
3585 /*
3586 * Sometimes, the OS only writes the last one bytes to update status
3587 * bits, for example, in linux, andb instruction is used in clear_bit().
3588 */
3589 if (!(offset & (pte_size - 1)) && bytes == 1)
3590 return false;
3591
3592 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3593 misaligned |= bytes < 4;
3594
3595 return misaligned;
3596}
3597
3598static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
3599{
3600 unsigned page_offset, quadrant;
3601 u64 *spte;
3602 int level;
3603
3604 page_offset = offset_in_page(gpa);
3605 level = sp->role.level;
3606 *nspte = 1;
3607 if (!sp->role.cr4_pae) {
3608 page_offset <<= 1; /* 32->64 */
3609 /*
3610 * A 32-bit pde maps 4MB while the shadow pdes map
3611 * only 2MB. So we need to double the offset again
3612 * and zap two pdes instead of one.
3613 */
3614 if (level == PT32_ROOT_LEVEL) {
3615 page_offset &= ~7; /* kill rounding error */
3616 page_offset <<= 1;
3617 *nspte = 2;
3618 }
3619 quadrant = page_offset >> PAGE_SHIFT;
3620 page_offset &= ~PAGE_MASK;
3621 if (quadrant != sp->role.quadrant)
3622 return NULL;
3623 }
3624
3625 spte = &sp->spt[page_offset / sizeof(*spte)];
3626 return spte;
3530} 3627}
3531 3628
3532void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 3629void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3533 const u8 *new, int bytes, 3630 const u8 *new, int bytes)
3534 bool guest_initiated)
3535{ 3631{
3536 gfn_t gfn = gpa >> PAGE_SHIFT; 3632 gfn_t gfn = gpa >> PAGE_SHIFT;
3537 union kvm_mmu_page_role mask = { .word = 0 }; 3633 union kvm_mmu_page_role mask = { .word = 0 };
@@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3539 struct hlist_node *node; 3635 struct hlist_node *node;
3540 LIST_HEAD(invalid_list); 3636 LIST_HEAD(invalid_list);
3541 u64 entry, gentry, *spte; 3637 u64 entry, gentry, *spte;
3542 unsigned pte_size, page_offset, misaligned, quadrant, offset; 3638 int npte;
3543 int level, npte, invlpg_counter, r, flooded = 0;
3544 bool remote_flush, local_flush, zap_page; 3639 bool remote_flush, local_flush, zap_page;
3545 3640
3546 /* 3641 /*
@@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3551 return; 3646 return;
3552 3647
3553 zap_page = remote_flush = local_flush = false; 3648 zap_page = remote_flush = local_flush = false;
3554 offset = offset_in_page(gpa);
3555 3649
3556 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3650 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3557 3651
3558 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); 3652 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
3559 3653
3560 /* 3654 /*
3561 * Assume that the pte write on a page table of the same type 3655 * No need to care whether allocation memory is successful
3562 * as the current vcpu paging mode since we update the sptes only 3656 * or not since pte prefetch is skiped if it does not have
3563 * when they have the same mode. 3657 * enough objects in the cache.
3564 */ 3658 */
3565 if ((is_pae(vcpu) && bytes == 4) || !new) { 3659 mmu_topup_memory_caches(vcpu);
3566 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3567 if (is_pae(vcpu)) {
3568 gpa &= ~(gpa_t)7;
3569 bytes = 8;
3570 }
3571 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3572 if (r)
3573 gentry = 0;
3574 new = (const u8 *)&gentry;
3575 }
3576
3577 switch (bytes) {
3578 case 4:
3579 gentry = *(const u32 *)new;
3580 break;
3581 case 8:
3582 gentry = *(const u64 *)new;
3583 break;
3584 default:
3585 gentry = 0;
3586 break;
3587 }
3588 3660
3589 spin_lock(&vcpu->kvm->mmu_lock); 3661 spin_lock(&vcpu->kvm->mmu_lock);
3590 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3591 gentry = 0;
3592 kvm_mmu_free_some_pages(vcpu);
3593 ++vcpu->kvm->stat.mmu_pte_write; 3662 ++vcpu->kvm->stat.mmu_pte_write;
3594 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3663 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3595 if (guest_initiated) {
3596 kvm_mmu_access_page(vcpu, gfn);
3597 if (gfn == vcpu->arch.last_pt_write_gfn
3598 && !last_updated_pte_accessed(vcpu)) {
3599 ++vcpu->arch.last_pt_write_count;
3600 if (vcpu->arch.last_pt_write_count >= 3)
3601 flooded = 1;
3602 } else {
3603 vcpu->arch.last_pt_write_gfn = gfn;
3604 vcpu->arch.last_pt_write_count = 1;
3605 vcpu->arch.last_pte_updated = NULL;
3606 }
3607 }
3608 3664
3609 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; 3665 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3610 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 3666 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3611 pte_size = sp->role.cr4_pae ? 8 : 4; 3667 spte = get_written_sptes(sp, gpa, &npte);
3612 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 3668
3613 misaligned |= bytes < 4; 3669 if (detect_write_misaligned(sp, gpa, bytes) ||
3614 if (misaligned || flooded) { 3670 detect_write_flooding(sp, spte)) {
3615 /*
3616 * Misaligned accesses are too much trouble to fix
3617 * up; also, they usually indicate a page is not used
3618 * as a page table.
3619 *
3620 * If we're seeing too many writes to a page,
3621 * it may no longer be a page table, or we may be
3622 * forking, in which case it is better to unmap the
3623 * page.
3624 */
3625 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3626 gpa, bytes, sp->role.word);
3627 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3671 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3628 &invalid_list); 3672 &invalid_list);
3629 ++vcpu->kvm->stat.mmu_flooded; 3673 ++vcpu->kvm->stat.mmu_flooded;
3630 continue; 3674 continue;
3631 } 3675 }
3632 page_offset = offset; 3676
3633 level = sp->role.level; 3677 spte = get_written_sptes(sp, gpa, &npte);
3634 npte = 1; 3678 if (!spte)
3635 if (!sp->role.cr4_pae) { 3679 continue;
3636 page_offset <<= 1; /* 32->64 */ 3680
3637 /*
3638 * A 32-bit pde maps 4MB while the shadow pdes map
3639 * only 2MB. So we need to double the offset again
3640 * and zap two pdes instead of one.
3641 */
3642 if (level == PT32_ROOT_LEVEL) {
3643 page_offset &= ~7; /* kill rounding error */
3644 page_offset <<= 1;
3645 npte = 2;
3646 }
3647 quadrant = page_offset >> PAGE_SHIFT;
3648 page_offset &= ~PAGE_MASK;
3649 if (quadrant != sp->role.quadrant)
3650 continue;
3651 }
3652 local_flush = true; 3681 local_flush = true;
3653 spte = &sp->spt[page_offset / sizeof(*spte)];
3654 while (npte--) { 3682 while (npte--) {
3655 entry = *spte; 3683 entry = *spte;
3656 mmu_page_zap_pte(vcpu->kvm, sp, spte); 3684 mmu_page_zap_pte(vcpu->kvm, sp, spte);
3657 if (gentry && 3685 if (gentry &&
3658 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3686 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3659 & mask.word)) 3687 & mask.word) && rmap_can_add(vcpu))
3660 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3688 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3661 if (!remote_flush && need_remote_flush(entry, *spte)) 3689 if (!remote_flush && need_remote_flush(entry, *spte))
3662 remote_flush = true; 3690 remote_flush = true;
@@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3665 } 3693 }
3666 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3694 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3667 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3695 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3668 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3696 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3669 spin_unlock(&vcpu->kvm->mmu_lock); 3697 spin_unlock(&vcpu->kvm->mmu_lock);
3670} 3698}
3671 3699
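The checks that kvm_mmu_pte_write() used to perform inline are now delegated to the helpers it calls: get_written_sptes() (whose closing lines appear just above this hunk), detect_write_misaligned() and detect_write_flooding(). The misalignment test can be reconstructed from the inline code removed in this hunk; a minimal sketch under that assumption (the helper introduced elsewhere in the patch may differ in detail):

static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				    int bytes)
{
	unsigned offset, pte_size, misaligned;

	offset = offset_in_page(gpa);
	pte_size = sp->role.cr4_pae ? 8 : 4;

	/*
	 * Misaligned accesses are too much trouble to fix up; they also
	 * usually indicate that the page is not being used as a page table.
	 */
	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	misaligned |= bytes < 4;

	return misaligned;
}

The flooding side of the old heuristic (the per-vcpu last_pt_write_* counters removed here and in paging_tmpl.h further down) is superseded by per shadow-page accounting: the clear_sp_write_flooding_count() calls added to the fetch path below reset that count whenever the page is reached through a normal page-table walk.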
@@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
3679 3707
3680 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3708 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
3681 3709
3682 spin_lock(&vcpu->kvm->mmu_lock);
3683 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3710 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3684 spin_unlock(&vcpu->kvm->mmu_lock); 3711
3685 return r; 3712 return r;
3686} 3713}
3687EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 3714EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3702 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3729 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3703} 3730}
3704 3731
3732static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
3733{
3734 if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
3735 return vcpu_match_mmio_gpa(vcpu, addr);
3736
3737 return vcpu_match_mmio_gva(vcpu, addr);
3738}
3739
3705int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 3740int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3706 void *insn, int insn_len) 3741 void *insn, int insn_len)
3707{ 3742{
3708 int r; 3743 int r, emulation_type = EMULTYPE_RETRY;
3709 enum emulation_result er; 3744 enum emulation_result er;
3710 3745
3711 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); 3746 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3717 goto out; 3752 goto out;
3718 } 3753 }
3719 3754
3720 r = mmu_topup_memory_caches(vcpu); 3755 if (is_mmio_page_fault(vcpu, cr2))
3721 if (r) 3756 emulation_type = 0;
3722 goto out;
3723 3757
3724 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); 3758 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
3725 3759
3726 switch (er) { 3760 switch (er) {
3727 case EMULATE_DONE: 3761 case EMULATE_DONE:
@@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3792int kvm_mmu_create(struct kvm_vcpu *vcpu) 3826int kvm_mmu_create(struct kvm_vcpu *vcpu)
3793{ 3827{
3794 ASSERT(vcpu); 3828 ASSERT(vcpu);
3795 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3829
3830 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
3831 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3832 vcpu->arch.mmu.translate_gpa = translate_gpa;
3833 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
3796 3834
3797 return alloc_mmu_pages(vcpu); 3835 return alloc_mmu_pages(vcpu);
3798} 3836}
@@ -3852,14 +3890,14 @@ restart:
3852 spin_unlock(&kvm->mmu_lock); 3890 spin_unlock(&kvm->mmu_lock);
3853} 3891}
3854 3892
3855static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 3893static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3856 struct list_head *invalid_list) 3894 struct list_head *invalid_list)
3857{ 3895{
3858 struct kvm_mmu_page *page; 3896 struct kvm_mmu_page *page;
3859 3897
3860 page = container_of(kvm->arch.active_mmu_pages.prev, 3898 page = container_of(kvm->arch.active_mmu_pages.prev,
3861 struct kvm_mmu_page, link); 3899 struct kvm_mmu_page, link);
3862 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3900 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3863} 3901}
3864 3902
3865static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 3903static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3874 raw_spin_lock(&kvm_lock); 3912 raw_spin_lock(&kvm_lock);
3875 3913
3876 list_for_each_entry(kvm, &vm_list, vm_list) { 3914 list_for_each_entry(kvm, &vm_list, vm_list) {
3877 int idx, freed_pages; 3915 int idx;
3878 LIST_HEAD(invalid_list); 3916 LIST_HEAD(invalid_list);
3879 3917
3880 idx = srcu_read_lock(&kvm->srcu); 3918 idx = srcu_read_lock(&kvm->srcu);
3881 spin_lock(&kvm->mmu_lock); 3919 spin_lock(&kvm->mmu_lock);
3882 if (!kvm_freed && nr_to_scan > 0 && 3920 if (!kvm_freed && nr_to_scan > 0 &&
3883 kvm->arch.n_used_mmu_pages > 0) { 3921 kvm->arch.n_used_mmu_pages > 0) {
3884 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3922 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3885 &invalid_list); 3923 &invalid_list);
3886 kvm_freed = kvm; 3924 kvm_freed = kvm;
3887 } 3925 }
3888 nr_to_scan--; 3926 nr_to_scan--;
@@ -3944,15 +3982,15 @@ nomem:
3944 */ 3982 */
3945unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) 3983unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3946{ 3984{
3947 int i;
3948 unsigned int nr_mmu_pages; 3985 unsigned int nr_mmu_pages;
3949 unsigned int nr_pages = 0; 3986 unsigned int nr_pages = 0;
3950 struct kvm_memslots *slots; 3987 struct kvm_memslots *slots;
3988 struct kvm_memory_slot *memslot;
3951 3989
3952 slots = kvm_memslots(kvm); 3990 slots = kvm_memslots(kvm);
3953 3991
3954 for (i = 0; i < slots->nmemslots; i++) 3992 kvm_for_each_memslot(memslot, slots)
3955 nr_pages += slots->memslots[i].npages; 3993 nr_pages += memslot->npages;
3956 3994
3957 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3995 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3958 nr_mmu_pages = max(nr_mmu_pages, 3996 nr_mmu_pages = max(nr_mmu_pages,
@@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3961 return nr_mmu_pages; 3999 return nr_mmu_pages;
3962} 4000}
3963 4001
3964static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3965 unsigned len)
3966{
3967 if (len > buffer->len)
3968 return NULL;
3969 return buffer->ptr;
3970}
3971
3972static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3973 unsigned len)
3974{
3975 void *ret;
3976
3977 ret = pv_mmu_peek_buffer(buffer, len);
3978 if (!ret)
3979 return ret;
3980 buffer->ptr += len;
3981 buffer->len -= len;
3982 buffer->processed += len;
3983 return ret;
3984}
3985
3986static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3987 gpa_t addr, gpa_t value)
3988{
3989 int bytes = 8;
3990 int r;
3991
3992 if (!is_long_mode(vcpu) && !is_pae(vcpu))
3993 bytes = 4;
3994
3995 r = mmu_topup_memory_caches(vcpu);
3996 if (r)
3997 return r;
3998
3999 if (!emulator_write_phys(vcpu, addr, &value, bytes))
4000 return -EFAULT;
4001
4002 return 1;
4003}
4004
4005static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
4006{
4007 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
4008 return 1;
4009}
4010
4011static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
4012{
4013 spin_lock(&vcpu->kvm->mmu_lock);
4014 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
4015 spin_unlock(&vcpu->kvm->mmu_lock);
4016 return 1;
4017}
4018
4019static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
4020 struct kvm_pv_mmu_op_buffer *buffer)
4021{
4022 struct kvm_mmu_op_header *header;
4023
4024 header = pv_mmu_peek_buffer(buffer, sizeof *header);
4025 if (!header)
4026 return 0;
4027 switch (header->op) {
4028 case KVM_MMU_OP_WRITE_PTE: {
4029 struct kvm_mmu_op_write_pte *wpte;
4030
4031 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
4032 if (!wpte)
4033 return 0;
4034 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
4035 wpte->pte_val);
4036 }
4037 case KVM_MMU_OP_FLUSH_TLB: {
4038 struct kvm_mmu_op_flush_tlb *ftlb;
4039
4040 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
4041 if (!ftlb)
4042 return 0;
4043 return kvm_pv_mmu_flush_tlb(vcpu);
4044 }
4045 case KVM_MMU_OP_RELEASE_PT: {
4046 struct kvm_mmu_op_release_pt *rpt;
4047
4048 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
4049 if (!rpt)
4050 return 0;
4051 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
4052 }
4053 default: return 0;
4054 }
4055}
4056
4057int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
4058 gpa_t addr, unsigned long *ret)
4059{
4060 int r;
4061 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
4062
4063 buffer->ptr = buffer->buf;
4064 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
4065 buffer->processed = 0;
4066
4067 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
4068 if (r)
4069 goto out;
4070
4071 while (buffer->len) {
4072 r = kvm_pv_mmu_op_one(vcpu, buffer);
4073 if (r < 0)
4074 goto out;
4075 if (r == 0)
4076 break;
4077 }
4078
4079 r = 1;
4080out:
4081 *ret = buffer->processed;
4082 return r;
4083}
4084
4085int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 4002int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
4086{ 4003{
4087 struct kvm_shadow_walk_iterator iterator; 4004 struct kvm_shadow_walk_iterator iterator;
@@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4110 mmu_free_memory_caches(vcpu); 4027 mmu_free_memory_caches(vcpu);
4111} 4028}
4112 4029
4113#ifdef CONFIG_KVM_MMU_AUDIT
4114#include "mmu_audit.c"
4115#else
4116static void mmu_audit_disable(void) { }
4117#endif
4118
4119void kvm_mmu_module_exit(void) 4030void kvm_mmu_module_exit(void)
4120{ 4031{
4121 mmu_destroy_caches(); 4032 mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec259d024..fe15dcc07a6b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,6 +19,15 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22char const *audit_point_name[] = {
23 "pre page fault",
24 "post page fault",
25 "pre pte write",
26 "post pte write",
27 "pre sync",
28 "post sync"
29};
30
22#define audit_printk(kvm, fmt, args...) \ 31#define audit_printk(kvm, fmt, args...) \
23 printk(KERN_ERR "audit: (%s) error: " \ 32 printk(KERN_ERR "audit: (%s) error: " \
24 fmt, audit_point_name[kvm->arch.audit_point], ##args) 33 fmt, audit_point_name[kvm->arch.audit_point], ##args)
@@ -224,7 +233,10 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
224 mmu_spte_walk(vcpu, audit_spte); 233 mmu_spte_walk(vcpu, audit_spte);
225} 234}
226 235
227static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) 236static bool mmu_audit;
237static struct jump_label_key mmu_audit_key;
238
239static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
228{ 240{
229 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); 241 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
230 242
@@ -236,18 +248,18 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
236 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
237} 249}
238 250
239static bool mmu_audit; 251static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
252{
253 if (static_branch((&mmu_audit_key)))
254 __kvm_mmu_audit(vcpu, point);
255}
240 256
241static void mmu_audit_enable(void) 257static void mmu_audit_enable(void)
242{ 258{
243 int ret;
244
245 if (mmu_audit) 259 if (mmu_audit)
246 return; 260 return;
247 261
248 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); 262 jump_label_inc(&mmu_audit_key);
249 WARN_ON(ret);
250
251 mmu_audit = true; 263 mmu_audit = true;
252} 264}
253 265
@@ -256,8 +268,7 @@ static void mmu_audit_disable(void)
256 if (!mmu_audit) 268 if (!mmu_audit)
257 return; 269 return;
258 270
259 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); 271 jump_label_dec(&mmu_audit_key);
260 tracepoint_synchronize_unregister();
261 mmu_audit = false; 272 mmu_audit = false;
262} 273}
263 274
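With this change the audit hook is guarded by a static branch rather than a tracepoint: kvm_mmu_audit() costs a patched-out jump while auditing is off, and mmu_audit_enable()/mmu_audit_disable() flip it with jump_label_inc()/jump_label_dec(). The same pattern in isolation, using the 3.3-era jump-label API seen above (feature_slow_path() is purely illustrative):

extern void feature_slow_path(void);		/* illustrative slow path */

static struct jump_label_key feature_key;	/* starts disabled */

static inline void feature_hook(void)
{
	if (static_branch(&feature_key))	/* a NOP until the key is bumped */
		feature_slow_path();
}

static void feature_enable(void)  { jump_label_inc(&feature_key); }
static void feature_disable(void) { jump_label_dec(&feature_key); }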
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f34146d..89fb0e81322a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access) 244 __entry->access)
245); 245);
246
247TRACE_EVENT(
248 kvm_mmu_audit,
249 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
250 TP_ARGS(vcpu, audit_point),
251
252 TP_STRUCT__entry(
253 __field(struct kvm_vcpu *, vcpu)
254 __field(int, audit_point)
255 ),
256
257 TP_fast_assign(
258 __entry->vcpu = vcpu;
259 __entry->audit_point = audit_point;
260 ),
261
262 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
263 audit_point_name[__entry->audit_point])
264);
265#endif /* _TRACE_KVMMMU_H */ 246#endif /* _TRACE_KVMMMU_H */
266 247
267#undef TRACE_INCLUDE_PATH 248#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 92994100638b..15610285ebb6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
497 shadow_walk_next(&it)) { 497 shadow_walk_next(&it)) {
498 gfn_t table_gfn; 498 gfn_t table_gfn;
499 499
500 clear_sp_write_flooding_count(it.sptep);
500 drop_large_spte(vcpu, it.sptep); 501 drop_large_spte(vcpu, it.sptep);
501 502
502 sp = NULL; 503 sp = NULL;
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
522 shadow_walk_next(&it)) { 523 shadow_walk_next(&it)) {
523 gfn_t direct_gfn; 524 gfn_t direct_gfn;
524 525
526 clear_sp_write_flooding_count(it.sptep);
525 validate_direct_spte(vcpu, it.sptep, direct_access); 527 validate_direct_spte(vcpu, it.sptep, direct_access);
526 528
527 drop_large_spte(vcpu, it.sptep); 529 drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
536 link_shadow_page(it.sptep, sp); 538 link_shadow_page(it.sptep, sp);
537 } 539 }
538 540
541 clear_sp_write_flooding_count(it.sptep);
539 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 542 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
540 user_fault, write_fault, emulate, it.level, 543 user_fault, write_fault, emulate, it.level,
541 gw->gfn, pfn, prefault, map_writable); 544 gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
599 */ 602 */
600 if (!r) { 603 if (!r) {
601 pgprintk("%s: guest page fault\n", __func__); 604 pgprintk("%s: guest page fault\n", __func__);
602 if (!prefault) { 605 if (!prefault)
603 inject_page_fault(vcpu, &walker.fault); 606 inject_page_fault(vcpu, &walker.fault);
604 /* reset fork detector */ 607
605 vcpu->arch.last_pt_write_count = 0;
606 }
607 return 0; 608 return 0;
608 } 609 }
609 610
@@ -631,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
631 if (mmu_notifier_retry(vcpu, mmu_seq)) 632 if (mmu_notifier_retry(vcpu, mmu_seq))
632 goto out_unlock; 633 goto out_unlock;
633 634
634 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 635 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
635 kvm_mmu_free_some_pages(vcpu); 636 kvm_mmu_free_some_pages(vcpu);
636 if (!force_pt_level) 637 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 638 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
@@ -641,11 +642,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
641 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, 642 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
642 sptep, *sptep, emulate); 643 sptep, *sptep, emulate);
643 644
644 if (!emulate)
645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
646
647 ++vcpu->stat.pf_fixed; 645 ++vcpu->stat.pf_fixed;
648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 646 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
649 spin_unlock(&vcpu->kvm->mmu_lock); 647 spin_unlock(&vcpu->kvm->mmu_lock);
650 648
651 return emulate; 649 return emulate;
@@ -656,65 +654,66 @@ out_unlock:
656 return 0; 654 return 0;
657} 655}
658 656
657static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
658{
659 int offset = 0;
660
661 WARN_ON(sp->role.level != 1);
662
663 if (PTTYPE == 32)
664 offset = sp->role.quadrant << PT64_LEVEL_BITS;
665
666 return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
667}
668
659static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 669static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
660{ 670{
661 struct kvm_shadow_walk_iterator iterator; 671 struct kvm_shadow_walk_iterator iterator;
662 struct kvm_mmu_page *sp; 672 struct kvm_mmu_page *sp;
663 gpa_t pte_gpa = -1;
664 int level; 673 int level;
665 u64 *sptep; 674 u64 *sptep;
666 int need_flush = 0;
667 675
668 vcpu_clear_mmio_info(vcpu, gva); 676 vcpu_clear_mmio_info(vcpu, gva);
669 677
670 spin_lock(&vcpu->kvm->mmu_lock); 678 /*
679 * No need to check return value here, rmap_can_add() can
680 * help us to skip pte prefetch later.
681 */
682 mmu_topup_memory_caches(vcpu);
671 683
684 spin_lock(&vcpu->kvm->mmu_lock);
672 for_each_shadow_entry(vcpu, gva, iterator) { 685 for_each_shadow_entry(vcpu, gva, iterator) {
673 level = iterator.level; 686 level = iterator.level;
674 sptep = iterator.sptep; 687 sptep = iterator.sptep;
675 688
676 sp = page_header(__pa(sptep)); 689 sp = page_header(__pa(sptep));
677 if (is_last_spte(*sptep, level)) { 690 if (is_last_spte(*sptep, level)) {
678 int offset, shift; 691 pt_element_t gpte;
692 gpa_t pte_gpa;
679 693
680 if (!sp->unsync) 694 if (!sp->unsync)
681 break; 695 break;
682 696
683 shift = PAGE_SHIFT - 697 pte_gpa = FNAME(get_level1_sp_gpa)(sp);
684 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
685 offset = sp->role.quadrant << shift;
686
687 pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
688 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 698 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
689 699
690 if (is_shadow_present_pte(*sptep)) { 700 if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
691 if (is_large_pte(*sptep)) 701 kvm_flush_remote_tlbs(vcpu->kvm);
692 --vcpu->kvm->stat.lpages;
693 drop_spte(vcpu->kvm, sptep);
694 need_flush = 1;
695 } else if (is_mmio_spte(*sptep))
696 mmu_spte_clear_no_track(sptep);
697 702
698 break; 703 if (!rmap_can_add(vcpu))
704 break;
705
706 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
707 sizeof(pt_element_t)))
708 break;
709
710 FNAME(update_pte)(vcpu, sp, sptep, &gpte);
699 } 711 }
700 712
701 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) 713 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
702 break; 714 break;
703 } 715 }
704
705 if (need_flush)
706 kvm_flush_remote_tlbs(vcpu->kvm);
707
708 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
709
710 spin_unlock(&vcpu->kvm->mmu_lock); 716 spin_unlock(&vcpu->kvm->mmu_lock);
711
712 if (pte_gpa == -1)
713 return;
714
715 if (mmu_topup_memory_caches(vcpu))
716 return;
717 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
718} 717}
719 718
720static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 719static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -769,19 +768,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
769 */ 768 */
770static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 769static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
771{ 770{
772 int i, offset, nr_present; 771 int i, nr_present = 0;
773 bool host_writable; 772 bool host_writable;
774 gpa_t first_pte_gpa; 773 gpa_t first_pte_gpa;
775 774
776 offset = nr_present = 0;
777
778 /* direct kvm_mmu_page can not be unsync. */ 775 /* direct kvm_mmu_page can not be unsync. */
779 BUG_ON(sp->role.direct); 776 BUG_ON(sp->role.direct);
780 777
781 if (PTTYPE == 32) 778 first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
782 offset = sp->role.quadrant << PT64_LEVEL_BITS;
783
784 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
785 779
786 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 780 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
787 unsigned pte_access; 781 unsigned pte_access;
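FNAME(get_level1_sp_gpa)() centralizes the quadrant arithmetic that invlpg and sync_page previously open-coded. A worked example of the value it yields, assuming only that PTTYPE == 32 uses 4-byte guest ptes and that PT64_LEVEL_BITS is 9:

/*
 * A 32-bit guest page table holds 1024 4-byte gptes, but a level-1 shadow
 * page has only 512 sptes, so sp->role.quadrant records which half of the
 * guest table this shadow page mirrors:
 *
 *   quadrant 0: gpa = gfn_to_gpa(sp->gfn) + (0 << 9) * sizeof(pt_element_t)
 *                   = page base
 *   quadrant 1: gpa = gfn_to_gpa(sp->gfn) + (1 << 9) * sizeof(pt_element_t)
 *                   = page base + 2048
 *
 * For the 64-bit instantiation the offset is always 0 and the result is
 * simply the base of the guest page-table page.
 */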
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..7aad5446f393
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,533 @@
1/*
 2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 *
6 * Authors:
7 * Avi Kivity <avi@redhat.com>
8 * Gleb Natapov <gleb@redhat.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 */
14
15#include <linux/types.h>
16#include <linux/kvm_host.h>
17#include <linux/perf_event.h>
18#include "x86.h"
19#include "cpuid.h"
20#include "lapic.h"
21
22static struct kvm_arch_event_perf_mapping {
23 u8 eventsel;
24 u8 unit_mask;
25 unsigned event_type;
26 bool inexact;
27} arch_events[] = {
28 /* Index must match CPUID 0x0A.EBX bit vector */
29 [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
30 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
31 [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
32 [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
36};
37
38/* mapping between fixed pmc index and arch_events array */
39int fixed_pmc_events[] = {1, 0, 2};
40
41static bool pmc_is_gp(struct kvm_pmc *pmc)
42{
43 return pmc->type == KVM_PMC_GP;
44}
45
46static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
47{
48 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
49
50 return pmu->counter_bitmask[pmc->type];
51}
52
53static inline bool pmc_enabled(struct kvm_pmc *pmc)
54{
55 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
56 return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
57}
58
59static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
60 u32 base)
61{
62 if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
63 return &pmu->gp_counters[msr - base];
64 return NULL;
65}
66
67static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
68{
69 int base = MSR_CORE_PERF_FIXED_CTR0;
70 if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
71 return &pmu->fixed_counters[msr - base];
72 return NULL;
73}
74
75static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
76{
77 return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
78}
79
80static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
81{
82 if (idx < X86_PMC_IDX_FIXED)
83 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
84 else
85 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
86}
87
88void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
89{
90 if (vcpu->arch.apic)
91 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
92}
93
94static void trigger_pmi(struct irq_work *irq_work)
95{
96 struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
97 irq_work);
98 struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
99 arch.pmu);
100
101 kvm_deliver_pmi(vcpu);
102}
103
104static void kvm_perf_overflow(struct perf_event *perf_event,
105 struct perf_sample_data *data,
106 struct pt_regs *regs)
107{
108 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
109 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
110 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
111}
112
113static void kvm_perf_overflow_intr(struct perf_event *perf_event,
114 struct perf_sample_data *data, struct pt_regs *regs)
115{
116 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
117 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
118 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
119 kvm_perf_overflow(perf_event, data, regs);
120 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
121 /*
122 * Inject PMI. If vcpu was in a guest mode during NMI PMI
 123 * can be injected on a guest mode re-entry. Otherwise we can't
124 * be sure that vcpu wasn't executing hlt instruction at the
 125 * time of vmexit and is not going to re-enter guest mode until
126 * woken up. So we should wake it, but this is impossible from
127 * NMI context. Do it from irq work instead.
128 */
129 if (!kvm_is_in_guest())
130 irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
131 else
132 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
133 }
134}
135
136static u64 read_pmc(struct kvm_pmc *pmc)
137{
138 u64 counter, enabled, running;
139
140 counter = pmc->counter;
141
142 if (pmc->perf_event)
143 counter += perf_event_read_value(pmc->perf_event,
144 &enabled, &running);
145
146 /* FIXME: Scaling needed? */
147
148 return counter & pmc_bitmask(pmc);
149}
150
151static void stop_counter(struct kvm_pmc *pmc)
152{
153 if (pmc->perf_event) {
154 pmc->counter = read_pmc(pmc);
155 perf_event_release_kernel(pmc->perf_event);
156 pmc->perf_event = NULL;
157 }
158}
159
160static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
161 unsigned config, bool exclude_user, bool exclude_kernel,
162 bool intr)
163{
164 struct perf_event *event;
165 struct perf_event_attr attr = {
166 .type = type,
167 .size = sizeof(attr),
168 .pinned = true,
169 .exclude_idle = true,
170 .exclude_host = 1,
171 .exclude_user = exclude_user,
172 .exclude_kernel = exclude_kernel,
173 .config = config,
174 };
175
176 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
177
178 event = perf_event_create_kernel_counter(&attr, -1, current,
179 intr ? kvm_perf_overflow_intr :
180 kvm_perf_overflow, pmc);
181 if (IS_ERR(event)) {
182 printk_once("kvm: pmu event creation failed %ld\n",
183 PTR_ERR(event));
184 return;
185 }
186
187 pmc->perf_event = event;
188 clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
189}
190
191static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
192 u8 unit_mask)
193{
194 int i;
195
196 for (i = 0; i < ARRAY_SIZE(arch_events); i++)
197 if (arch_events[i].eventsel == event_select
198 && arch_events[i].unit_mask == unit_mask
199 && (pmu->available_event_types & (1 << i)))
200 break;
201
202 if (i == ARRAY_SIZE(arch_events))
203 return PERF_COUNT_HW_MAX;
204
205 return arch_events[i].event_type;
206}
207
208static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
209{
210 unsigned config, type = PERF_TYPE_RAW;
211 u8 event_select, unit_mask;
212
213 pmc->eventsel = eventsel;
214
215 stop_counter(pmc);
216
217 if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
218 return;
219
220 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
221 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
222
223 if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
224 ARCH_PERFMON_EVENTSEL_INV |
225 ARCH_PERFMON_EVENTSEL_CMASK))) {
226 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
227 unit_mask);
228 if (config != PERF_COUNT_HW_MAX)
229 type = PERF_TYPE_HARDWARE;
230 }
231
232 if (type == PERF_TYPE_RAW)
233 config = eventsel & X86_RAW_EVENT_MASK;
234
235 reprogram_counter(pmc, type, config,
236 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
237 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
238 eventsel & ARCH_PERFMON_EVENTSEL_INT);
239}
240
241static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
242{
243 unsigned en = en_pmi & 0x3;
244 bool pmi = en_pmi & 0x8;
245
246 stop_counter(pmc);
247
248 if (!en || !pmc_enabled(pmc))
249 return;
250
251 reprogram_counter(pmc, PERF_TYPE_HARDWARE,
252 arch_events[fixed_pmc_events[idx]].event_type,
253 !(en & 0x2), /* exclude user */
254 !(en & 0x1), /* exclude kernel */
255 pmi);
256}
257
258static inline u8 fixed_en_pmi(u64 ctrl, int idx)
259{
260 return (ctrl >> (idx * 4)) & 0xf;
261}
262
263static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
264{
265 int i;
266
267 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
268 u8 en_pmi = fixed_en_pmi(data, i);
269 struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
270
271 if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
272 continue;
273
274 reprogram_fixed_counter(pmc, en_pmi, i);
275 }
276
277 pmu->fixed_ctr_ctrl = data;
278}
279
280static void reprogram_idx(struct kvm_pmu *pmu, int idx)
281{
282 struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
283
284 if (!pmc)
285 return;
286
287 if (pmc_is_gp(pmc))
288 reprogram_gp_counter(pmc, pmc->eventsel);
289 else {
290 int fidx = idx - X86_PMC_IDX_FIXED;
291 reprogram_fixed_counter(pmc,
292 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
293 }
294}
295
296static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
297{
298 int bit;
299 u64 diff = pmu->global_ctrl ^ data;
300
301 pmu->global_ctrl = data;
302
303 for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
304 reprogram_idx(pmu, bit);
305}
306
307bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
308{
309 struct kvm_pmu *pmu = &vcpu->arch.pmu;
310 int ret;
311
312 switch (msr) {
313 case MSR_CORE_PERF_FIXED_CTR_CTRL:
314 case MSR_CORE_PERF_GLOBAL_STATUS:
315 case MSR_CORE_PERF_GLOBAL_CTRL:
316 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
317 ret = pmu->version > 1;
318 break;
319 default:
320 ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
321 || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
322 || get_fixed_pmc(pmu, msr);
323 break;
324 }
325 return ret;
326}
327
328int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
329{
330 struct kvm_pmu *pmu = &vcpu->arch.pmu;
331 struct kvm_pmc *pmc;
332
333 switch (index) {
334 case MSR_CORE_PERF_FIXED_CTR_CTRL:
335 *data = pmu->fixed_ctr_ctrl;
336 return 0;
337 case MSR_CORE_PERF_GLOBAL_STATUS:
338 *data = pmu->global_status;
339 return 0;
340 case MSR_CORE_PERF_GLOBAL_CTRL:
341 *data = pmu->global_ctrl;
342 return 0;
343 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
344 *data = pmu->global_ovf_ctrl;
345 return 0;
346 default:
347 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
348 (pmc = get_fixed_pmc(pmu, index))) {
349 *data = read_pmc(pmc);
350 return 0;
351 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
352 *data = pmc->eventsel;
353 return 0;
354 }
355 }
356 return 1;
357}
358
359int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
360{
361 struct kvm_pmu *pmu = &vcpu->arch.pmu;
362 struct kvm_pmc *pmc;
363
364 switch (index) {
365 case MSR_CORE_PERF_FIXED_CTR_CTRL:
366 if (pmu->fixed_ctr_ctrl == data)
367 return 0;
368 if (!(data & 0xfffffffffffff444)) {
369 reprogram_fixed_counters(pmu, data);
370 return 0;
371 }
372 break;
373 case MSR_CORE_PERF_GLOBAL_STATUS:
374 break; /* RO MSR */
375 case MSR_CORE_PERF_GLOBAL_CTRL:
376 if (pmu->global_ctrl == data)
377 return 0;
378 if (!(data & pmu->global_ctrl_mask)) {
379 global_ctrl_changed(pmu, data);
380 return 0;
381 }
382 break;
383 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
384 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
385 pmu->global_status &= ~data;
386 pmu->global_ovf_ctrl = data;
387 return 0;
388 }
389 break;
390 default:
391 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
392 (pmc = get_fixed_pmc(pmu, index))) {
393 data = (s64)(s32)data;
394 pmc->counter += data - read_pmc(pmc);
395 return 0;
396 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
397 if (data == pmc->eventsel)
398 return 0;
399 if (!(data & 0xffffffff00200000ull)) {
400 reprogram_gp_counter(pmc, data);
401 return 0;
402 }
403 }
404 }
405 return 1;
406}
407
408int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
409{
410 struct kvm_pmu *pmu = &vcpu->arch.pmu;
411 bool fast_mode = pmc & (1u << 31);
412 bool fixed = pmc & (1u << 30);
413 struct kvm_pmc *counters;
414 u64 ctr;
415
416 pmc &= (3u << 30) - 1;
417 if (!fixed && pmc >= pmu->nr_arch_gp_counters)
418 return 1;
419 if (fixed && pmc >= pmu->nr_arch_fixed_counters)
420 return 1;
421 counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
422 ctr = read_pmc(&counters[pmc]);
423 if (fast_mode)
424 ctr = (u32)ctr;
425 *data = ctr;
426
427 return 0;
428}
429
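/*
 * Illustrative note (not part of the patch): the pmc argument above carries
 * the architectural RDPMC index the guest loads into ECX, decoded by
 * kvm_pmu_read_pmc() as:
 *
 *   bit 31   - "fast" read: only the low 32 bits of the counter are returned
 *   bit 30   - select the fixed-counter bank rather than the GP counters
 *   low bits - counter index within the selected bank
 *
 * e.g. ECX = 0x40000001 reads fixed counter 1 in full, while 0x80000000
 * reads GP counter 0 truncated to 32 bits.
 */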
430void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
431{
432 struct kvm_pmu *pmu = &vcpu->arch.pmu;
433 struct kvm_cpuid_entry2 *entry;
434 unsigned bitmap_len;
435
436 pmu->nr_arch_gp_counters = 0;
437 pmu->nr_arch_fixed_counters = 0;
438 pmu->counter_bitmask[KVM_PMC_GP] = 0;
439 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
440 pmu->version = 0;
441
442 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
443 if (!entry)
444 return;
445
446 pmu->version = entry->eax & 0xff;
447 if (!pmu->version)
448 return;
449
450 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
451 X86_PMC_MAX_GENERIC);
452 pmu->counter_bitmask[KVM_PMC_GP] =
453 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
454 bitmap_len = (entry->eax >> 24) & 0xff;
455 pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
456
457 if (pmu->version == 1) {
458 pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
459 return;
460 }
461
462 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
463 X86_PMC_MAX_FIXED);
464 pmu->counter_bitmask[KVM_PMC_FIXED] =
465 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
466 pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
467 | (((1ull << pmu->nr_arch_fixed_counters) - 1)
468 << X86_PMC_IDX_FIXED));
469}
470
471void kvm_pmu_init(struct kvm_vcpu *vcpu)
472{
473 int i;
474 struct kvm_pmu *pmu = &vcpu->arch.pmu;
475
476 memset(pmu, 0, sizeof(*pmu));
477 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
478 pmu->gp_counters[i].type = KVM_PMC_GP;
479 pmu->gp_counters[i].vcpu = vcpu;
480 pmu->gp_counters[i].idx = i;
481 }
482 for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
483 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
484 pmu->fixed_counters[i].vcpu = vcpu;
485 pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
486 }
487 init_irq_work(&pmu->irq_work, trigger_pmi);
488 kvm_pmu_cpuid_update(vcpu);
489}
490
491void kvm_pmu_reset(struct kvm_vcpu *vcpu)
492{
493 struct kvm_pmu *pmu = &vcpu->arch.pmu;
494 int i;
495
496 irq_work_sync(&pmu->irq_work);
497 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
498 struct kvm_pmc *pmc = &pmu->gp_counters[i];
499 stop_counter(pmc);
500 pmc->counter = pmc->eventsel = 0;
501 }
502
503 for (i = 0; i < X86_PMC_MAX_FIXED; i++)
504 stop_counter(&pmu->fixed_counters[i]);
505
506 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
507 pmu->global_ovf_ctrl = 0;
508}
509
510void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
511{
512 kvm_pmu_reset(vcpu);
513}
514
515void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
516{
517 struct kvm_pmu *pmu = &vcpu->arch.pmu;
518 u64 bitmask;
519 int bit;
520
521 bitmask = pmu->reprogram_pmi;
522
523 for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
524 struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
525
526 if (unlikely(!pmc || !pmc->perf_event)) {
527 clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
528 continue;
529 }
530
531 reprogram_idx(pmu, bit);
532 }
533}
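One encoding worth spelling out is MSR_CORE_PERF_FIXED_CTR_CTRL, which kvm_pmu_set_msr() validates against the 0xfffffffffffff444 reserved-bit mask and which fixed_en_pmi() slices into one 4-bit field per fixed counter (bit 0 counts in kernel mode, bit 1 in user mode, bit 3 requests a PMI on overflow; bit 2 is rejected as reserved here). A worked example under those assumptions:

/*
 * Guest writes IA32_FIXED_CTR_CTRL = 0x0b0:
 *
 *   counter 0, bits  3:0 = 0x0 -> disabled
 *   counter 1, bits  7:4 = 0xb -> count in kernel and user mode (0x3),
 *                                 raise a PMI on overflow (0x8)
 *   counter 2, bits 11:8 = 0x0 -> disabled
 *
 * 0x0b0 & 0xfffffffffffff444 == 0, so the write is accepted; provided the
 * counter is also enabled in MSR_CORE_PERF_GLOBAL_CTRL,
 * reprogram_fixed_counters() creates a perf event for fixed counter 1
 * (unhalted core cycles, via fixed_pmc_events[1] -> arch_events[0]).
 */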
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e32243eac2f4..5fa553babe56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm)
1014 set_intercept(svm, INTERCEPT_NMI); 1014 set_intercept(svm, INTERCEPT_NMI);
1015 set_intercept(svm, INTERCEPT_SMI); 1015 set_intercept(svm, INTERCEPT_SMI);
1016 set_intercept(svm, INTERCEPT_SELECTIVE_CR0); 1016 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1017 set_intercept(svm, INTERCEPT_RDPMC);
1017 set_intercept(svm, INTERCEPT_CPUID); 1018 set_intercept(svm, INTERCEPT_CPUID);
1018 set_intercept(svm, INTERCEPT_INVD); 1019 set_intercept(svm, INTERCEPT_INVD);
1019 set_intercept(svm, INTERCEPT_HLT); 1020 set_intercept(svm, INTERCEPT_HLT);
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2770 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 2771 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2771} 2772}
2772 2773
2774static int rdpmc_interception(struct vcpu_svm *svm)
2775{
2776 int err;
2777
2778 if (!static_cpu_has(X86_FEATURE_NRIPS))
2779 return emulate_on_interception(svm);
2780
2781 err = kvm_rdpmc(&svm->vcpu);
2782 kvm_complete_insn_gp(&svm->vcpu, err);
2783
2784 return 1;
2785}
2786
2773bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) 2787bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2774{ 2788{
2775 unsigned long cr0 = svm->vcpu.arch.cr0; 2789 unsigned long cr0 = svm->vcpu.arch.cr0;
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3190 [SVM_EXIT_SMI] = nop_on_interception, 3204 [SVM_EXIT_SMI] = nop_on_interception,
3191 [SVM_EXIT_INIT] = nop_on_interception, 3205 [SVM_EXIT_INIT] = nop_on_interception,
3192 [SVM_EXIT_VINTR] = interrupt_window_interception, 3206 [SVM_EXIT_VINTR] = interrupt_window_interception,
3207 [SVM_EXIT_RDPMC] = rdpmc_interception,
3193 [SVM_EXIT_CPUID] = cpuid_interception, 3208 [SVM_EXIT_CPUID] = cpuid_interception,
3194 [SVM_EXIT_IRET] = iret_interception, 3209 [SVM_EXIT_IRET] = iret_interception,
3195 [SVM_EXIT_INVD] = emulate_on_interception, 3210 [SVM_EXIT_INVD] = emulate_on_interception,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea1cd83..6b85cc647f34 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include "kvm_timer.h" 19#include "kvm_timer.h"
20 20
21static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) 21enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
22{ 22{
23 int restart_timer = 0; 23 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
24 struct kvm_vcpu *vcpu = ktimer->vcpu;
24 wait_queue_head_t *q = &vcpu->wq; 25 wait_queue_head_t *q = &vcpu->wq;
25 26
26 /* 27 /*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
40 41
41 if (ktimer->t_ops->is_periodic(ktimer)) { 42 if (ktimer->t_ops->is_periodic(ktimer)) {
42 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 43 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
43 restart_timer = 1;
44 }
45
46 return restart_timer;
47}
48
49enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
50{
51 int restart_timer;
52 struct kvm_vcpu *vcpu;
53 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
54
55 vcpu = ktimer->vcpu;
56 if (!vcpu)
57 return HRTIMER_NORESTART;
58
59 restart_timer = __kvm_timer_fn(vcpu, ktimer);
60 if (restart_timer)
61 return HRTIMER_RESTART; 44 return HRTIMER_RESTART;
62 else 45 } else
63 return HRTIMER_NORESTART; 46 return HRTIMER_NORESTART;
64} 47}
65
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b51696a..d29216c462b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -18,6 +18,7 @@
18 18
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "cpuid.h"
21 22
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
23#include <linux/module.h> 24#include <linux/module.h>
@@ -50,29 +51,29 @@
50MODULE_AUTHOR("Qumranet"); 51MODULE_AUTHOR("Qumranet");
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52 53
53static int __read_mostly enable_vpid = 1; 54static bool __read_mostly enable_vpid = 1;
54module_param_named(vpid, enable_vpid, bool, 0444); 55module_param_named(vpid, enable_vpid, bool, 0444);
55 56
56static int __read_mostly flexpriority_enabled = 1; 57static bool __read_mostly flexpriority_enabled = 1;
57module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); 58module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
58 59
59static int __read_mostly enable_ept = 1; 60static bool __read_mostly enable_ept = 1;
60module_param_named(ept, enable_ept, bool, S_IRUGO); 61module_param_named(ept, enable_ept, bool, S_IRUGO);
61 62
62static int __read_mostly enable_unrestricted_guest = 1; 63static bool __read_mostly enable_unrestricted_guest = 1;
63module_param_named(unrestricted_guest, 64module_param_named(unrestricted_guest,
64 enable_unrestricted_guest, bool, S_IRUGO); 65 enable_unrestricted_guest, bool, S_IRUGO);
65 66
66static int __read_mostly emulate_invalid_guest_state = 0; 67static bool __read_mostly emulate_invalid_guest_state = 0;
67module_param(emulate_invalid_guest_state, bool, S_IRUGO); 68module_param(emulate_invalid_guest_state, bool, S_IRUGO);
68 69
69static int __read_mostly vmm_exclusive = 1; 70static bool __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 71module_param(vmm_exclusive, bool, S_IRUGO);
71 72
72static int __read_mostly yield_on_hlt = 1; 73static bool __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO); 74module_param(yield_on_hlt, bool, S_IRUGO);
74 75
75static int __read_mostly fasteoi = 1; 76static bool __read_mostly fasteoi = 1;
76module_param(fasteoi, bool, S_IRUGO); 77module_param(fasteoi, bool, S_IRUGO);
77 78
78/* 79/*
@@ -80,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
80 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 81 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
81 * use VMX instructions. 82 * use VMX instructions.
82 */ 83 */
83static int __read_mostly nested = 0; 84static bool __read_mostly nested = 0;
84module_param(nested, bool, S_IRUGO); 85module_param(nested, bool, S_IRUGO);
85 86
86#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 87#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
@@ -1747,7 +1748,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1747 int save_nmsrs, index; 1748 int save_nmsrs, index;
1748 unsigned long *msr_bitmap; 1749 unsigned long *msr_bitmap;
1749 1750
1750 vmx_load_host_state(vmx);
1751 save_nmsrs = 0; 1751 save_nmsrs = 0;
1752#ifdef CONFIG_X86_64 1752#ifdef CONFIG_X86_64
1753 if (is_long_mode(&vmx->vcpu)) { 1753 if (is_long_mode(&vmx->vcpu)) {
@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
1956#endif 1956#endif
1957 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 1957 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1958 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 1958 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1959 CPU_BASED_RDPMC_EXITING |
1959 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1960 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1960 /* 1961 /*
1961 * We can allow some features even when not supported by the 1962 * We can allow some features even when not supported by the
@@ -2142,12 +2143,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2142 return 1; 2143 return 1;
2143 /* Otherwise falls through */ 2144 /* Otherwise falls through */
2144 default: 2145 default:
2145 vmx_load_host_state(to_vmx(vcpu));
2146 if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) 2146 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2147 return 0; 2147 return 0;
2148 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2148 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2149 if (msr) { 2149 if (msr) {
2150 vmx_load_host_state(to_vmx(vcpu));
2151 data = msr->data; 2150 data = msr->data;
2152 break; 2151 break;
2153 } 2152 }
@@ -2171,7 +2170,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2171 2170
2172 switch (msr_index) { 2171 switch (msr_index) {
2173 case MSR_EFER: 2172 case MSR_EFER:
2174 vmx_load_host_state(vmx);
2175 ret = kvm_set_msr_common(vcpu, msr_index, data); 2173 ret = kvm_set_msr_common(vcpu, msr_index, data);
2176 break; 2174 break;
2177#ifdef CONFIG_X86_64 2175#ifdef CONFIG_X86_64
@@ -2220,7 +2218,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2220 break; 2218 break;
2221 msr = find_msr_entry(vmx, msr_index); 2219 msr = find_msr_entry(vmx, msr_index);
2222 if (msr) { 2220 if (msr) {
2223 vmx_load_host_state(vmx);
2224 msr->data = data; 2221 msr->data = data;
2225 break; 2222 break;
2226 } 2223 }
@@ -2414,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2414 CPU_BASED_USE_TSC_OFFSETING | 2411 CPU_BASED_USE_TSC_OFFSETING |
2415 CPU_BASED_MWAIT_EXITING | 2412 CPU_BASED_MWAIT_EXITING |
2416 CPU_BASED_MONITOR_EXITING | 2413 CPU_BASED_MONITOR_EXITING |
2417 CPU_BASED_INVLPG_EXITING; 2414 CPU_BASED_INVLPG_EXITING |
2415 CPU_BASED_RDPMC_EXITING;
2418 2416
2419 if (yield_on_hlt) 2417 if (yield_on_hlt)
2420 min |= CPU_BASED_HLT_EXITING; 2418 min |= CPU_BASED_HLT_EXITING;
@@ -2716,11 +2714,13 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2716{ 2714{
2717 if (!kvm->arch.tss_addr) { 2715 if (!kvm->arch.tss_addr) {
2718 struct kvm_memslots *slots; 2716 struct kvm_memslots *slots;
2717 struct kvm_memory_slot *slot;
2719 gfn_t base_gfn; 2718 gfn_t base_gfn;
2720 2719
2721 slots = kvm_memslots(kvm); 2720 slots = kvm_memslots(kvm);
2722 base_gfn = slots->memslots[0].base_gfn + 2721 slot = id_to_memslot(slots, 0);
2723 kvm->memslots->memslots[0].npages - 3; 2722 base_gfn = slot->base_gfn + slot->npages - 3;
2723
2724 return base_gfn << PAGE_SHIFT; 2724 return base_gfn << PAGE_SHIFT;
2725 } 2725 }
2726 return kvm->arch.tss_addr; 2726 return kvm->arch.tss_addr;
@@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
3945static void enable_irq_window(struct kvm_vcpu *vcpu) 3945static void enable_irq_window(struct kvm_vcpu *vcpu)
3946{ 3946{
3947 u32 cpu_based_vm_exec_control; 3947 u32 cpu_based_vm_exec_control;
3948 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 3948 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
3949 /* We can get here when nested_run_pending caused 3949 /*
3950 * vmx_interrupt_allowed() to return false. In this case, do 3950 * We get here if vmx_interrupt_allowed() said we can't
3951 * nothing - the interrupt will be injected later. 3951 * inject to L1 now because L2 must run. Ask L2 to exit
3952 * right after entry, so we can inject to L1 more promptly.
3952 */ 3953 */
3954 kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
3953 return; 3955 return;
3956 }
3954 3957
3955 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3958 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3956 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 3959 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -4077,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4077static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4080static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4078{ 4081{
4079 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4082 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
4080 struct vmcs12 *vmcs12; 4083 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4081 if (to_vmx(vcpu)->nested.nested_run_pending) 4084 if (to_vmx(vcpu)->nested.nested_run_pending ||
4085 (vmcs12->idt_vectoring_info_field &
4086 VECTORING_INFO_VALID_MASK))
4082 return 0; 4087 return 0;
4083 nested_vmx_vmexit(vcpu); 4088 nested_vmx_vmexit(vcpu);
4084 vmcs12 = get_vmcs12(vcpu);
4085 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; 4089 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
4086 vmcs12->vm_exit_intr_info = 0; 4090 vmcs12->vm_exit_intr_info = 0;
4087 /* fall through to normal code, but now in L1, not L2 */ 4091 /* fall through to normal code, but now in L1, not L2 */
@@ -4611,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
4611 return 1; 4615 return 1;
4612} 4616}
4613 4617
4618static int handle_rdpmc(struct kvm_vcpu *vcpu)
4619{
4620 int err;
4621
4622 err = kvm_rdpmc(vcpu);
4623 kvm_complete_insn_gp(vcpu, err);
4624
4625 return 1;
4626}
4627
4614static int handle_wbinvd(struct kvm_vcpu *vcpu) 4628static int handle_wbinvd(struct kvm_vcpu *vcpu)
4615{ 4629{
4616 skip_emulated_instruction(vcpu); 4630 skip_emulated_instruction(vcpu);
@@ -5561,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5561 [EXIT_REASON_HLT] = handle_halt, 5575 [EXIT_REASON_HLT] = handle_halt,
5562 [EXIT_REASON_INVD] = handle_invd, 5576 [EXIT_REASON_INVD] = handle_invd,
5563 [EXIT_REASON_INVLPG] = handle_invlpg, 5577 [EXIT_REASON_INVLPG] = handle_invlpg,
5578 [EXIT_REASON_RDPMC] = handle_rdpmc,
5564 [EXIT_REASON_VMCALL] = handle_vmcall, 5579 [EXIT_REASON_VMCALL] = handle_vmcall,
5565 [EXIT_REASON_VMCLEAR] = handle_vmclear, 5580 [EXIT_REASON_VMCLEAR] = handle_vmclear,
5566 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 5581 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..14d6cadc4ba6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
26#include "tss.h" 26#include "tss.h"
27#include "kvm_cache_regs.h" 27#include "kvm_cache_regs.h"
28#include "x86.h" 28#include "x86.h"
29#include "cpuid.h"
29 30
30#include <linux/clocksource.h> 31#include <linux/clocksource.h>
31#include <linux/interrupt.h> 32#include <linux/interrupt.h>
@@ -82,15 +83,13 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
82#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 83#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
83 84
84static void update_cr8_intercept(struct kvm_vcpu *vcpu); 85static void update_cr8_intercept(struct kvm_vcpu *vcpu);
85static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
86 struct kvm_cpuid_entry2 __user *entries);
87static void process_nmi(struct kvm_vcpu *vcpu); 86static void process_nmi(struct kvm_vcpu *vcpu);
88 87
89struct kvm_x86_ops *kvm_x86_ops; 88struct kvm_x86_ops *kvm_x86_ops;
90EXPORT_SYMBOL_GPL(kvm_x86_ops); 89EXPORT_SYMBOL_GPL(kvm_x86_ops);
91 90
92int ignore_msrs = 0; 91static bool ignore_msrs = 0;
93module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 92module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
94 93
95bool kvm_has_tsc_control; 94bool kvm_has_tsc_control;
96EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 95EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
@@ -574,58 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
574} 573}
575EXPORT_SYMBOL_GPL(kvm_set_xcr); 574EXPORT_SYMBOL_GPL(kvm_set_xcr);
576 575
577static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
578{
579 struct kvm_cpuid_entry2 *best;
580
581 best = kvm_find_cpuid_entry(vcpu, 1, 0);
582 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
583}
584
585static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
586{
587 struct kvm_cpuid_entry2 *best;
588
589 best = kvm_find_cpuid_entry(vcpu, 7, 0);
590 return best && (best->ebx & bit(X86_FEATURE_SMEP));
591}
592
593static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
594{
595 struct kvm_cpuid_entry2 *best;
596
597 best = kvm_find_cpuid_entry(vcpu, 7, 0);
598 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
599}
600
601static void update_cpuid(struct kvm_vcpu *vcpu)
602{
603 struct kvm_cpuid_entry2 *best;
604 struct kvm_lapic *apic = vcpu->arch.apic;
605 u32 timer_mode_mask;
606
607 best = kvm_find_cpuid_entry(vcpu, 1, 0);
608 if (!best)
609 return;
610
611 /* Update OSXSAVE bit */
612 if (cpu_has_xsave && best->function == 0x1) {
613 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
614 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
615 best->ecx |= bit(X86_FEATURE_OSXSAVE);
616 }
617
618 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
619 best->function == 0x1) {
620 best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
621 timer_mode_mask = 3 << 17;
622 } else
623 timer_mode_mask = 1 << 17;
624
625 if (apic)
626 apic->lapic_timer.timer_mode_mask = timer_mode_mask;
627}
628
629int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 576int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
630{ 577{
631 unsigned long old_cr4 = kvm_read_cr4(vcpu); 578 unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -659,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
659 kvm_mmu_reset_context(vcpu); 606 kvm_mmu_reset_context(vcpu);
660 607
661 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 608 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
662 update_cpuid(vcpu); 609 kvm_update_cpuid(vcpu);
663 610
664 return 0; 611 return 0;
665} 612}
@@ -813,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
813} 760}
814EXPORT_SYMBOL_GPL(kvm_get_dr); 761EXPORT_SYMBOL_GPL(kvm_get_dr);
815 762
763bool kvm_rdpmc(struct kvm_vcpu *vcpu)
764{
765 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
766 u64 data;
767 int err;
768
769 err = kvm_pmu_read_pmc(vcpu, ecx, &data);
770 if (err)
771 return err;
772 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
773 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
774 return err;
775}
776EXPORT_SYMBOL_GPL(kvm_rdpmc);
777
816/* 778/*
817 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 779 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
818 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 780 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -1362,12 +1324,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1362 if (page_num >= blob_size) 1324 if (page_num >= blob_size)
1363 goto out; 1325 goto out;
1364 r = -ENOMEM; 1326 r = -ENOMEM;
1365 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 1327 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1366 if (!page) 1328 if (IS_ERR(page)) {
1329 r = PTR_ERR(page);
1367 goto out; 1330 goto out;
1368 r = -EFAULT; 1331 }
1369 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
1370 goto out_free;
1371 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1332 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1372 goto out_free; 1333 goto out_free;
1373 r = 0; 1334 r = 0;
@@ -1656,8 +1617,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1656 * which we perfectly emulate ;-). Any other value should be at least 1617 * which we perfectly emulate ;-). Any other value should be at least
1657 * reported, some guests depend on them. 1618 * reported, some guests depend on them.
1658 */ 1619 */
1659 case MSR_P6_EVNTSEL0:
1660 case MSR_P6_EVNTSEL1:
1661 case MSR_K7_EVNTSEL0: 1620 case MSR_K7_EVNTSEL0:
1662 case MSR_K7_EVNTSEL1: 1621 case MSR_K7_EVNTSEL1:
1663 case MSR_K7_EVNTSEL2: 1622 case MSR_K7_EVNTSEL2:
@@ -1669,8 +1628,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1669 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1628 /* at least RHEL 4 unconditionally writes to the perfctr registers,
1670 * so we ignore writes to make it happy. 1629 * so we ignore writes to make it happy.
1671 */ 1630 */
1672 case MSR_P6_PERFCTR0:
1673 case MSR_P6_PERFCTR1:
1674 case MSR_K7_PERFCTR0: 1631 case MSR_K7_PERFCTR0:
1675 case MSR_K7_PERFCTR1: 1632 case MSR_K7_PERFCTR1:
1676 case MSR_K7_PERFCTR2: 1633 case MSR_K7_PERFCTR2:
@@ -1707,6 +1664,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1707 default: 1664 default:
1708 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1665 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1709 return xen_hvm_config(vcpu, data); 1666 return xen_hvm_config(vcpu, data);
1667 if (kvm_pmu_msr(vcpu, msr))
1668 return kvm_pmu_set_msr(vcpu, msr, data);
1710 if (!ignore_msrs) { 1669 if (!ignore_msrs) {
1711 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1670 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1712 msr, data); 1671 msr, data);
@@ -1869,10 +1828,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1869 case MSR_K8_SYSCFG: 1828 case MSR_K8_SYSCFG:
1870 case MSR_K7_HWCR: 1829 case MSR_K7_HWCR:
1871 case MSR_VM_HSAVE_PA: 1830 case MSR_VM_HSAVE_PA:
1872 case MSR_P6_PERFCTR0:
1873 case MSR_P6_PERFCTR1:
1874 case MSR_P6_EVNTSEL0:
1875 case MSR_P6_EVNTSEL1:
1876 case MSR_K7_EVNTSEL0: 1831 case MSR_K7_EVNTSEL0:
1877 case MSR_K7_PERFCTR0: 1832 case MSR_K7_PERFCTR0:
1878 case MSR_K8_INT_PENDING_MSG: 1833 case MSR_K8_INT_PENDING_MSG:
@@ -1983,6 +1938,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1983 data = 0xbe702111; 1938 data = 0xbe702111;
1984 break; 1939 break;
1985 default: 1940 default:
1941 if (kvm_pmu_msr(vcpu, msr))
1942 return kvm_pmu_get_msr(vcpu, msr, pdata);
1986 if (!ignore_msrs) { 1943 if (!ignore_msrs) {
1987 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1944 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1988 return 1; 1945 return 1;
@@ -2041,15 +1998,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2041 if (msrs.nmsrs >= MAX_IO_MSRS) 1998 if (msrs.nmsrs >= MAX_IO_MSRS)
2042 goto out; 1999 goto out;
2043 2000
2044 r = -ENOMEM;
2045 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 2001 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2046 entries = kmalloc(size, GFP_KERNEL); 2002 entries = memdup_user(user_msrs->entries, size);
2047 if (!entries) 2003 if (IS_ERR(entries)) {
2004 r = PTR_ERR(entries);
2048 goto out; 2005 goto out;
2049 2006 }
2050 r = -EFAULT;
2051 if (copy_from_user(entries, user_msrs->entries, size))
2052 goto out_free;
2053 2007
2054 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 2008 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2055 if (r < 0) 2009 if (r < 0)
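The two hunks above (xen_hvm_config() and msr_io()) apply the same conversion that recurs throughout this patch: an open-coded kmalloc()/kzalloc() followed by copy_from_user(), with separate -ENOMEM and -EFAULT error paths, is collapsed into a single memdup_user() call that reports failure through ERR_PTR(). A minimal sketch of the before/after shapes; struct foo and the get_foo_*() helpers are hypothetical names used purely for illustration, not part of the patch:

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/err.h>

struct foo { int x; };	/* hypothetical payload, for illustration only */

/* Before: allocate, then copy, with two distinct error paths. */
static struct foo *get_foo_old(const void __user *argp)
{
	struct foo *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(p, argp, sizeof(*p))) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
}

/*
 * After: memdup_user() allocates and copies in one step and already returns
 * ERR_PTR(-ENOMEM) or ERR_PTR(-EFAULT) on failure, so callers only need an
 * IS_ERR()/PTR_ERR() check, as in the hunks above.
 */
static struct foo *get_foo_new(const void __user *argp)
{
	return memdup_user(argp, sizeof(struct foo));
}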
@@ -2135,6 +2089,9 @@ int kvm_dev_ioctl_check_extension(long ext)
2135 case KVM_CAP_TSC_CONTROL: 2089 case KVM_CAP_TSC_CONTROL:
2136 r = kvm_has_tsc_control; 2090 r = kvm_has_tsc_control;
2137 break; 2091 break;
2092 case KVM_CAP_TSC_DEADLINE_TIMER:
2093 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2094 break;
2138 default: 2095 default:
2139 r = 0; 2096 r = 0;
2140 break; 2097 break;
@@ -2266,466 +2223,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2266 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2223 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
2267} 2224}
2268 2225
2269static int is_efer_nx(void)
2270{
2271 unsigned long long efer = 0;
2272
2273 rdmsrl_safe(MSR_EFER, &efer);
2274 return efer & EFER_NX;
2275}
2276
2277static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2278{
2279 int i;
2280 struct kvm_cpuid_entry2 *e, *entry;
2281
2282 entry = NULL;
2283 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2284 e = &vcpu->arch.cpuid_entries[i];
2285 if (e->function == 0x80000001) {
2286 entry = e;
2287 break;
2288 }
2289 }
2290 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
2291 entry->edx &= ~(1 << 20);
2292 printk(KERN_INFO "kvm: guest NX capability removed\n");
2293 }
2294}
2295
2296/* when an old userspace process fills a new kernel module */
2297static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2298 struct kvm_cpuid *cpuid,
2299 struct kvm_cpuid_entry __user *entries)
2300{
2301 int r, i;
2302 struct kvm_cpuid_entry *cpuid_entries;
2303
2304 r = -E2BIG;
2305 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2306 goto out;
2307 r = -ENOMEM;
2308 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
2309 if (!cpuid_entries)
2310 goto out;
2311 r = -EFAULT;
2312 if (copy_from_user(cpuid_entries, entries,
2313 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2314 goto out_free;
2315 for (i = 0; i < cpuid->nent; i++) {
2316 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
2317 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
2318 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
2319 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
2320 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
2321 vcpu->arch.cpuid_entries[i].index = 0;
2322 vcpu->arch.cpuid_entries[i].flags = 0;
2323 vcpu->arch.cpuid_entries[i].padding[0] = 0;
2324 vcpu->arch.cpuid_entries[i].padding[1] = 0;
2325 vcpu->arch.cpuid_entries[i].padding[2] = 0;
2326 }
2327 vcpu->arch.cpuid_nent = cpuid->nent;
2328 cpuid_fix_nx_cap(vcpu);
2329 r = 0;
2330 kvm_apic_set_version(vcpu);
2331 kvm_x86_ops->cpuid_update(vcpu);
2332 update_cpuid(vcpu);
2333
2334out_free:
2335 vfree(cpuid_entries);
2336out:
2337 return r;
2338}
2339
2340static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
2341 struct kvm_cpuid2 *cpuid,
2342 struct kvm_cpuid_entry2 __user *entries)
2343{
2344 int r;
2345
2346 r = -E2BIG;
2347 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2348 goto out;
2349 r = -EFAULT;
2350 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
2351 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
2352 goto out;
2353 vcpu->arch.cpuid_nent = cpuid->nent;
2354 kvm_apic_set_version(vcpu);
2355 kvm_x86_ops->cpuid_update(vcpu);
2356 update_cpuid(vcpu);
2357 return 0;
2358
2359out:
2360 return r;
2361}
2362
2363static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
2364 struct kvm_cpuid2 *cpuid,
2365 struct kvm_cpuid_entry2 __user *entries)
2366{
2367 int r;
2368
2369 r = -E2BIG;
2370 if (cpuid->nent < vcpu->arch.cpuid_nent)
2371 goto out;
2372 r = -EFAULT;
2373 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
2374 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
2375 goto out;
2376 return 0;
2377
2378out:
2379 cpuid->nent = vcpu->arch.cpuid_nent;
2380 return r;
2381}
2382
2383static void cpuid_mask(u32 *word, int wordnum)
2384{
2385 *word &= boot_cpu_data.x86_capability[wordnum];
2386}
2387
2388static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2389 u32 index)
2390{
2391 entry->function = function;
2392 entry->index = index;
2393 cpuid_count(entry->function, entry->index,
2394 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
2395 entry->flags = 0;
2396}
2397
2398static bool supported_xcr0_bit(unsigned bit)
2399{
2400 u64 mask = ((u64)1 << bit);
2401
2402 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2403}
2404
2405#define F(x) bit(X86_FEATURE_##x)
2406
2407static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2408 u32 index, int *nent, int maxnent)
2409{
2410 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
2411#ifdef CONFIG_X86_64
2412 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
2413 ? F(GBPAGES) : 0;
2414 unsigned f_lm = F(LM);
2415#else
2416 unsigned f_gbpages = 0;
2417 unsigned f_lm = 0;
2418#endif
2419 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
2420
2421 /* cpuid 1.edx */
2422 const u32 kvm_supported_word0_x86_features =
2423 F(FPU) | F(VME) | F(DE) | F(PSE) |
2424 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2425 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
2426 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2427 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
2428 0 /* Reserved, DS, ACPI */ | F(MMX) |
2429 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
2430 0 /* HTT, TM, Reserved, PBE */;
2431 /* cpuid 0x80000001.edx */
2432 const u32 kvm_supported_word1_x86_features =
2433 F(FPU) | F(VME) | F(DE) | F(PSE) |
2434 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2435 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
2436 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2437 F(PAT) | F(PSE36) | 0 /* Reserved */ |
2438 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
2439 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
2440 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
2441 /* cpuid 1.ecx */
2442 const u32 kvm_supported_word4_x86_features =
2443 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
2444 0 /* DS-CPL, VMX, SMX, EST */ |
2445 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
2446 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
2447 0 /* Reserved, DCA */ | F(XMM4_1) |
2448 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2449 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2450 F(F16C) | F(RDRAND);
2451 /* cpuid 0x80000001.ecx */
2452 const u32 kvm_supported_word6_x86_features =
2453 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
2454 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
2455 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2456 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2457
2458 /* cpuid 0xC0000001.edx */
2459 const u32 kvm_supported_word5_x86_features =
2460 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2461 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2462 F(PMM) | F(PMM_EN);
2463
2464 /* cpuid 7.0.ebx */
2465 const u32 kvm_supported_word9_x86_features =
2466 F(SMEP) | F(FSGSBASE) | F(ERMS);
2467
2468 /* all calls to cpuid_count() should be made on the same cpu */
2469 get_cpu();
2470 do_cpuid_1_ent(entry, function, index);
2471 ++*nent;
2472
2473 switch (function) {
2474 case 0:
2475 entry->eax = min(entry->eax, (u32)0xd);
2476 break;
2477 case 1:
2478 entry->edx &= kvm_supported_word0_x86_features;
2479 cpuid_mask(&entry->edx, 0);
2480 entry->ecx &= kvm_supported_word4_x86_features;
2481 cpuid_mask(&entry->ecx, 4);
2482 /* we support x2apic emulation even if host does not support
2483 * it since we emulate x2apic in software */
2484 entry->ecx |= F(X2APIC);
2485 break;
2486 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
2487 * may return different values. This forces us to get_cpu() before
2488 * issuing the first command, and also to emulate this annoying behavior
2489 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
2490 case 2: {
2491 int t, times = entry->eax & 0xff;
2492
2493 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2494 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2495 for (t = 1; t < times && *nent < maxnent; ++t) {
2496 do_cpuid_1_ent(&entry[t], function, 0);
2497 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2498 ++*nent;
2499 }
2500 break;
2501 }
2502 /* function 4 has additional index. */
2503 case 4: {
2504 int i, cache_type;
2505
2506 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2507 /* read more entries until cache_type is zero */
2508 for (i = 1; *nent < maxnent; ++i) {
2509 cache_type = entry[i - 1].eax & 0x1f;
2510 if (!cache_type)
2511 break;
2512 do_cpuid_1_ent(&entry[i], function, i);
2513 entry[i].flags |=
2514 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2515 ++*nent;
2516 }
2517 break;
2518 }
2519 case 7: {
2520 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2521		/* Mask ebx against host capability word 9 */
2522 if (index == 0) {
2523 entry->ebx &= kvm_supported_word9_x86_features;
2524 cpuid_mask(&entry->ebx, 9);
2525 } else
2526 entry->ebx = 0;
2527 entry->eax = 0;
2528 entry->ecx = 0;
2529 entry->edx = 0;
2530 break;
2531 }
2532 case 9:
2533 break;
2534 /* function 0xb has additional index. */
2535 case 0xb: {
2536 int i, level_type;
2537
2538 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2539 /* read more entries until level_type is zero */
2540 for (i = 1; *nent < maxnent; ++i) {
2541 level_type = entry[i - 1].ecx & 0xff00;
2542 if (!level_type)
2543 break;
2544 do_cpuid_1_ent(&entry[i], function, i);
2545 entry[i].flags |=
2546 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2547 ++*nent;
2548 }
2549 break;
2550 }
2551 case 0xd: {
2552 int idx, i;
2553
2554 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2555 for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2556 do_cpuid_1_ent(&entry[i], function, idx);
2557 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2558 continue;
2559 entry[i].flags |=
2560 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2561 ++*nent;
2562 ++i;
2563 }
2564 break;
2565 }
2566 case KVM_CPUID_SIGNATURE: {
2567 char signature[12] = "KVMKVMKVM\0\0";
2568 u32 *sigptr = (u32 *)signature;
2569 entry->eax = 0;
2570 entry->ebx = sigptr[0];
2571 entry->ecx = sigptr[1];
2572 entry->edx = sigptr[2];
2573 break;
2574 }
2575 case KVM_CPUID_FEATURES:
2576 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2577 (1 << KVM_FEATURE_NOP_IO_DELAY) |
2578 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2579 (1 << KVM_FEATURE_ASYNC_PF) |
2580 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2581
2582 if (sched_info_on())
2583 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2584
2585 entry->ebx = 0;
2586 entry->ecx = 0;
2587 entry->edx = 0;
2588 break;
2589 case 0x80000000:
2590 entry->eax = min(entry->eax, 0x8000001a);
2591 break;
2592 case 0x80000001:
2593 entry->edx &= kvm_supported_word1_x86_features;
2594 cpuid_mask(&entry->edx, 1);
2595 entry->ecx &= kvm_supported_word6_x86_features;
2596 cpuid_mask(&entry->ecx, 6);
2597 break;
2598 case 0x80000008: {
2599 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2600 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2601 unsigned phys_as = entry->eax & 0xff;
2602
2603 if (!g_phys_as)
2604 g_phys_as = phys_as;
2605 entry->eax = g_phys_as | (virt_as << 8);
2606 entry->ebx = entry->edx = 0;
2607 break;
2608 }
2609 case 0x80000019:
2610 entry->ecx = entry->edx = 0;
2611 break;
2612 case 0x8000001a:
2613 break;
2614 case 0x8000001d:
2615 break;
2616 /*Add support for Centaur's CPUID instruction*/
2617 case 0xC0000000:
2618 /*Just support up to 0xC0000004 now*/
2619 entry->eax = min(entry->eax, 0xC0000004);
2620 break;
2621 case 0xC0000001:
2622 entry->edx &= kvm_supported_word5_x86_features;
2623 cpuid_mask(&entry->edx, 5);
2624 break;
2625 case 3: /* Processor serial number */
2626 case 5: /* MONITOR/MWAIT */
2627 case 6: /* Thermal management */
2628 case 0xA: /* Architectural Performance Monitoring */
2629 case 0x80000007: /* Advanced power management */
2630 case 0xC0000002:
2631 case 0xC0000003:
2632 case 0xC0000004:
2633 default:
2634 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2635 break;
2636 }
2637
2638 kvm_x86_ops->set_supported_cpuid(function, entry);
2639
2640 put_cpu();
2641}
2642
2643#undef F
2644
2645static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2646 struct kvm_cpuid_entry2 __user *entries)
2647{
2648 struct kvm_cpuid_entry2 *cpuid_entries;
2649 int limit, nent = 0, r = -E2BIG;
2650 u32 func;
2651
2652 if (cpuid->nent < 1)
2653 goto out;
2654 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2655 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
2656 r = -ENOMEM;
2657 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
2658 if (!cpuid_entries)
2659 goto out;
2660
2661 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
2662 limit = cpuid_entries[0].eax;
2663 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
2664 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2665 &nent, cpuid->nent);
2666 r = -E2BIG;
2667 if (nent >= cpuid->nent)
2668 goto out_free;
2669
2670 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
2671 limit = cpuid_entries[nent - 1].eax;
2672 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
2673 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2674 &nent, cpuid->nent);
2675
2676
2677
2678 r = -E2BIG;
2679 if (nent >= cpuid->nent)
2680 goto out_free;
2681
2682 /* Add support for Centaur's CPUID instruction. */
2683 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2684 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2685 &nent, cpuid->nent);
2686
2687 r = -E2BIG;
2688 if (nent >= cpuid->nent)
2689 goto out_free;
2690
2691 limit = cpuid_entries[nent - 1].eax;
2692 for (func = 0xC0000001;
2693 func <= limit && nent < cpuid->nent; ++func)
2694 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2695 &nent, cpuid->nent);
2696
2697 r = -E2BIG;
2698 if (nent >= cpuid->nent)
2699 goto out_free;
2700 }
2701
2702 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2703 cpuid->nent);
2704
2705 r = -E2BIG;
2706 if (nent >= cpuid->nent)
2707 goto out_free;
2708
2709 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2710 cpuid->nent);
2711
2712 r = -E2BIG;
2713 if (nent >= cpuid->nent)
2714 goto out_free;
2715
2716 r = -EFAULT;
2717 if (copy_to_user(entries, cpuid_entries,
2718 nent * sizeof(struct kvm_cpuid_entry2)))
2719 goto out_free;
2720 cpuid->nent = nent;
2721 r = 0;
2722
2723out_free:
2724 vfree(cpuid_entries);
2725out:
2726 return r;
2727}
2728
2729static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2226static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2730 struct kvm_lapic_state *s) 2227 struct kvm_lapic_state *s)
2731{ 2228{
@@ -3043,13 +2540,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3043 r = -EINVAL; 2540 r = -EINVAL;
3044 if (!vcpu->arch.apic) 2541 if (!vcpu->arch.apic)
3045 goto out; 2542 goto out;
3046 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2543 u.lapic = memdup_user(argp, sizeof(*u.lapic));
3047 r = -ENOMEM; 2544 if (IS_ERR(u.lapic)) {
3048 if (!u.lapic) 2545 r = PTR_ERR(u.lapic);
3049 goto out;
3050 r = -EFAULT;
3051 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
3052 goto out; 2546 goto out;
2547 }
2548
3053 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 2549 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3054 if (r) 2550 if (r)
3055 goto out; 2551 goto out;
@@ -3228,14 +2724,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3228 break; 2724 break;
3229 } 2725 }
3230 case KVM_SET_XSAVE: { 2726 case KVM_SET_XSAVE: {
3231 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2727 u.xsave = memdup_user(argp, sizeof(*u.xsave));
3232 r = -ENOMEM; 2728 if (IS_ERR(u.xsave)) {
3233 if (!u.xsave) 2729 r = PTR_ERR(u.xsave);
3234 break; 2730 goto out;
3235 2731 }
3236 r = -EFAULT;
3237 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
3238 break;
3239 2732
3240 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 2733 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3241 break; 2734 break;
@@ -3256,15 +2749,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3256 break; 2749 break;
3257 } 2750 }
3258 case KVM_SET_XCRS: { 2751 case KVM_SET_XCRS: {
3259 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 2752 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3260 r = -ENOMEM; 2753 if (IS_ERR(u.xcrs)) {
3261 if (!u.xcrs) 2754 r = PTR_ERR(u.xcrs);
3262 break; 2755 goto out;
3263 2756 }
3264 r = -EFAULT;
3265 if (copy_from_user(u.xcrs, argp,
3266 sizeof(struct kvm_xcrs)))
3267 break;
3268 2757
3269 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 2758 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3270 break; 2759 break;
@@ -3461,16 +2950,59 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3461 return 0; 2950 return 0;
3462} 2951}
3463 2952
2953/**
2954 * write_protect_slot - write protect a slot for dirty logging
2955 * @kvm: the kvm instance
2956 * @memslot: the slot we protect
2957 * @dirty_bitmap: the bitmap indicating which pages are dirty
2958 * @nr_dirty_pages: the number of dirty pages
2959 *
2960 * We have two ways to find all sptes to protect:
2961 * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
2962 * checks ones that have a spte mapping a page in the slot.
2963 * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
2964 *
2965 * Generally speaking, if there are not so many dirty pages compared to the
2966 * Generally speaking, if the number of dirty pages is small compared to the
2967 * number of shadow pages, we should use the latter.
2968 *
2969 * Note that letting others write into a page marked dirty in the old bitmap
2970 * through a stale TLB entry is not a problem. That page will become
2971 * write protected again when we flush the TLB, and will then be reported
2972 * dirty to user space when the old bitmap is copied out.
2973static void write_protect_slot(struct kvm *kvm,
2974 struct kvm_memory_slot *memslot,
2975 unsigned long *dirty_bitmap,
2976 unsigned long nr_dirty_pages)
2977{
2978 /* Not many dirty pages compared to # of shadow pages. */
2979 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
2980 unsigned long gfn_offset;
2981
2982 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
2983 unsigned long gfn = memslot->base_gfn + gfn_offset;
2984
2985 spin_lock(&kvm->mmu_lock);
2986 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
2987 spin_unlock(&kvm->mmu_lock);
2988 }
2989 kvm_flush_remote_tlbs(kvm);
2990 } else {
2991 spin_lock(&kvm->mmu_lock);
2992 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
2993 spin_unlock(&kvm->mmu_lock);
2994 }
2995}
2996
3464/* 2997/*
3465 * Get (and clear) the dirty memory log for a memory slot. 2998 * Get (and clear) the dirty memory log for a memory slot.
3466 */ 2999 */
3467int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 3000int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3468 struct kvm_dirty_log *log) 3001 struct kvm_dirty_log *log)
3469{ 3002{
3470 int r, i; 3003 int r;
3471 struct kvm_memory_slot *memslot; 3004 struct kvm_memory_slot *memslot;
3472 unsigned long n; 3005 unsigned long n, nr_dirty_pages;
3473 unsigned long is_dirty = 0;
3474 3006
3475 mutex_lock(&kvm->slots_lock); 3007 mutex_lock(&kvm->slots_lock);
3476 3008
@@ -3478,43 +3010,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3478 if (log->slot >= KVM_MEMORY_SLOTS) 3010 if (log->slot >= KVM_MEMORY_SLOTS)
3479 goto out; 3011 goto out;
3480 3012
3481 memslot = &kvm->memslots->memslots[log->slot]; 3013 memslot = id_to_memslot(kvm->memslots, log->slot);
3482 r = -ENOENT; 3014 r = -ENOENT;
3483 if (!memslot->dirty_bitmap) 3015 if (!memslot->dirty_bitmap)
3484 goto out; 3016 goto out;
3485 3017
3486 n = kvm_dirty_bitmap_bytes(memslot); 3018 n = kvm_dirty_bitmap_bytes(memslot);
3487 3019 nr_dirty_pages = memslot->nr_dirty_pages;
3488 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
3489 is_dirty = memslot->dirty_bitmap[i];
3490 3020
3491 /* If nothing is dirty, don't bother messing with page tables. */ 3021 /* If nothing is dirty, don't bother messing with page tables. */
3492 if (is_dirty) { 3022 if (nr_dirty_pages) {
3493 struct kvm_memslots *slots, *old_slots; 3023 struct kvm_memslots *slots, *old_slots;
3494 unsigned long *dirty_bitmap; 3024 unsigned long *dirty_bitmap, *dirty_bitmap_head;
3495 3025
3496 dirty_bitmap = memslot->dirty_bitmap_head; 3026 dirty_bitmap = memslot->dirty_bitmap;
3497 if (memslot->dirty_bitmap == dirty_bitmap) 3027 dirty_bitmap_head = memslot->dirty_bitmap_head;
3498 dirty_bitmap += n / sizeof(long); 3028 if (dirty_bitmap == dirty_bitmap_head)
3499 memset(dirty_bitmap, 0, n); 3029 dirty_bitmap_head += n / sizeof(long);
3030 memset(dirty_bitmap_head, 0, n);
3500 3031
3501 r = -ENOMEM; 3032 r = -ENOMEM;
3502 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3033 slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
3503 if (!slots) 3034 if (!slots)
3504 goto out; 3035 goto out;
3505 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3036
3506 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3037 memslot = id_to_memslot(slots, log->slot);
3507 slots->generation++; 3038 memslot->nr_dirty_pages = 0;
3039 memslot->dirty_bitmap = dirty_bitmap_head;
3040 update_memslots(slots, NULL);
3508 3041
3509 old_slots = kvm->memslots; 3042 old_slots = kvm->memslots;
3510 rcu_assign_pointer(kvm->memslots, slots); 3043 rcu_assign_pointer(kvm->memslots, slots);
3511 synchronize_srcu_expedited(&kvm->srcu); 3044 synchronize_srcu_expedited(&kvm->srcu);
3512 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
3513 kfree(old_slots); 3045 kfree(old_slots);
3514 3046
3515 spin_lock(&kvm->mmu_lock); 3047 write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
3516 kvm_mmu_slot_remove_write_access(kvm, log->slot);
3517 spin_unlock(&kvm->mmu_lock);
3518 3048
3519 r = -EFAULT; 3049 r = -EFAULT;
3520 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 3050 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
@@ -3659,14 +3189,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
3659 } 3189 }
3660 case KVM_GET_IRQCHIP: { 3190 case KVM_GET_IRQCHIP: {
3661 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3191 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3662 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3192 struct kvm_irqchip *chip;
3663 3193
3664 r = -ENOMEM; 3194 chip = memdup_user(argp, sizeof(*chip));
3665 if (!chip) 3195 if (IS_ERR(chip)) {
3196 r = PTR_ERR(chip);
3666 goto out; 3197 goto out;
3667 r = -EFAULT; 3198 }
3668 if (copy_from_user(chip, argp, sizeof *chip)) 3199
3669 goto get_irqchip_out;
3670 r = -ENXIO; 3200 r = -ENXIO;
3671 if (!irqchip_in_kernel(kvm)) 3201 if (!irqchip_in_kernel(kvm))
3672 goto get_irqchip_out; 3202 goto get_irqchip_out;
@@ -3685,14 +3215,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
3685 } 3215 }
3686 case KVM_SET_IRQCHIP: { 3216 case KVM_SET_IRQCHIP: {
3687 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3217 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3688 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3218 struct kvm_irqchip *chip;
3689 3219
3690 r = -ENOMEM; 3220 chip = memdup_user(argp, sizeof(*chip));
3691 if (!chip) 3221 if (IS_ERR(chip)) {
3222 r = PTR_ERR(chip);
3692 goto out; 3223 goto out;
3693 r = -EFAULT; 3224 }
3694 if (copy_from_user(chip, argp, sizeof *chip)) 3225
3695 goto set_irqchip_out;
3696 r = -ENXIO; 3226 r = -ENXIO;
3697 if (!irqchip_in_kernel(kvm)) 3227 if (!irqchip_in_kernel(kvm))
3698 goto set_irqchip_out; 3228 goto set_irqchip_out;
@@ -3899,12 +3429,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3899 kvm_x86_ops->get_segment(vcpu, var, seg); 3429 kvm_x86_ops->get_segment(vcpu, var, seg);
3900} 3430}
3901 3431
3902static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3432gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3903{
3904 return gpa;
3905}
3906
3907static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3908{ 3433{
3909 gpa_t t_gpa; 3434 gpa_t t_gpa;
3910 struct x86_exception exception; 3435 struct x86_exception exception;
@@ -4088,7 +3613,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4088 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 3613 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4089 if (ret < 0) 3614 if (ret < 0)
4090 return 0; 3615 return 0;
4091 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 3616 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4092 return 1; 3617 return 1;
4093} 3618}
4094 3619
@@ -4325,7 +3850,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4325 if (!exchanged) 3850 if (!exchanged)
4326 return X86EMUL_CMPXCHG_FAILED; 3851 return X86EMUL_CMPXCHG_FAILED;
4327 3852
4328 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); 3853 kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4329 3854
4330 return X86EMUL_CONTINUE; 3855 return X86EMUL_CONTINUE;
4331 3856
@@ -4350,32 +3875,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4350 return r; 3875 return r;
4351} 3876}
4352 3877
4353 3878static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4354static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, 3879 unsigned short port, void *val,
4355 int size, unsigned short port, void *val, 3880 unsigned int count, bool in)
4356 unsigned int count)
4357{ 3881{
4358 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3882 trace_kvm_pio(!in, port, size, count);
4359
4360 if (vcpu->arch.pio.count)
4361 goto data_avail;
4362
4363 trace_kvm_pio(0, port, size, count);
4364 3883
4365 vcpu->arch.pio.port = port; 3884 vcpu->arch.pio.port = port;
4366 vcpu->arch.pio.in = 1; 3885 vcpu->arch.pio.in = in;
4367 vcpu->arch.pio.count = count; 3886 vcpu->arch.pio.count = count;
4368 vcpu->arch.pio.size = size; 3887 vcpu->arch.pio.size = size;
4369 3888
4370 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3889 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4371 data_avail:
4372 memcpy(val, vcpu->arch.pio_data, size * count);
4373 vcpu->arch.pio.count = 0; 3890 vcpu->arch.pio.count = 0;
4374 return 1; 3891 return 1;
4375 } 3892 }
4376 3893
4377 vcpu->run->exit_reason = KVM_EXIT_IO; 3894 vcpu->run->exit_reason = KVM_EXIT_IO;
4378 vcpu->run->io.direction = KVM_EXIT_IO_IN; 3895 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4379 vcpu->run->io.size = size; 3896 vcpu->run->io.size = size;
4380 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3897 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4381 vcpu->run->io.count = count; 3898 vcpu->run->io.count = count;
@@ -4384,36 +3901,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4384 return 0; 3901 return 0;
4385} 3902}
4386 3903
4387static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, 3904static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4388 int size, unsigned short port, 3905 int size, unsigned short port, void *val,
4389 const void *val, unsigned int count) 3906 unsigned int count)
4390{ 3907{
4391 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3908 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3909 int ret;
4392 3910
4393 trace_kvm_pio(1, port, size, count); 3911 if (vcpu->arch.pio.count)
4394 3912 goto data_avail;
4395 vcpu->arch.pio.port = port;
4396 vcpu->arch.pio.in = 0;
4397 vcpu->arch.pio.count = count;
4398 vcpu->arch.pio.size = size;
4399
4400 memcpy(vcpu->arch.pio_data, val, size * count);
4401 3913
4402 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3914 ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
3915 if (ret) {
3916data_avail:
3917 memcpy(val, vcpu->arch.pio_data, size * count);
4403 vcpu->arch.pio.count = 0; 3918 vcpu->arch.pio.count = 0;
4404 return 1; 3919 return 1;
4405 } 3920 }
4406 3921
4407 vcpu->run->exit_reason = KVM_EXIT_IO;
4408 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
4409 vcpu->run->io.size = size;
4410 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4411 vcpu->run->io.count = count;
4412 vcpu->run->io.port = port;
4413
4414 return 0; 3922 return 0;
4415} 3923}
4416 3924
3925static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
3926 int size, unsigned short port,
3927 const void *val, unsigned int count)
3928{
3929 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3930
3931 memcpy(vcpu->arch.pio_data, val, size * count);
3932 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
3933}
3934
4417static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3935static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4418{ 3936{
4419 return kvm_x86_ops->get_segment_base(vcpu, seg); 3937 return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -4628,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4628 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4146 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4629} 4147}
4630 4148
4149static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4150 u32 pmc, u64 *pdata)
4151{
4152 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4153}
4154
4631static void emulator_halt(struct x86_emulate_ctxt *ctxt) 4155static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4632{ 4156{
4633 emul_to_vcpu(ctxt)->arch.halt_request = 1; 4157 emul_to_vcpu(ctxt)->arch.halt_request = 1;
@@ -4680,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = {
4680 .set_dr = emulator_set_dr, 4204 .set_dr = emulator_set_dr,
4681 .set_msr = emulator_set_msr, 4205 .set_msr = emulator_set_msr,
4682 .get_msr = emulator_get_msr, 4206 .get_msr = emulator_get_msr,
4207 .read_pmc = emulator_read_pmc,
4683 .halt = emulator_halt, 4208 .halt = emulator_halt,
4684 .wbinvd = emulator_wbinvd, 4209 .wbinvd = emulator_wbinvd,
4685 .fix_hypercall = emulator_fix_hypercall, 4210 .fix_hypercall = emulator_fix_hypercall,
@@ -4837,6 +4362,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4837 return false; 4362 return false;
4838} 4363}
4839 4364
4365static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4366 unsigned long cr2, int emulation_type)
4367{
4368 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4369 unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
4370
4371 last_retry_eip = vcpu->arch.last_retry_eip;
4372 last_retry_addr = vcpu->arch.last_retry_addr;
4373
4374 /*
4375 * If the emulation is caused by #PF and it is a non-page-table
4376 * writing instruction, it means the VM-EXIT was caused by shadow
4377 * page protection; we can zap the shadow page and retry this
4378 * instruction directly.
4379 *
4380 * Note: if the guest uses a non-page-table modifying instruction
4381 * on the PDE that points to the instruction, then we will unmap
4382 * the instruction and go to an infinite loop. So, we cache the
4383 * last retried eip and the last fault address; if we hit the same
4384 * eip and address again, we can break out of the potential infinite
4385 * loop.
4386 */
4387 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
4388
4389 if (!(emulation_type & EMULTYPE_RETRY))
4390 return false;
4391
4392 if (x86_page_table_writing_insn(ctxt))
4393 return false;
4394
4395 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
4396 return false;
4397
4398 vcpu->arch.last_retry_eip = ctxt->eip;
4399 vcpu->arch.last_retry_addr = cr2;
4400
4401 if (!vcpu->arch.mmu.direct_map)
4402 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4403
4404 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4405
4406 return true;
4407}
4408
4840int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4409int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4841 unsigned long cr2, 4410 unsigned long cr2,
4842 int emulation_type, 4411 int emulation_type,
@@ -4878,6 +4447,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4878 return EMULATE_DONE; 4447 return EMULATE_DONE;
4879 } 4448 }
4880 4449
4450 if (retry_instruction(ctxt, cr2, emulation_type))
4451 return EMULATE_DONE;
4452
4881 /* this is needed for vmware backdoor interface to work since it 4453 /* this is needed for vmware backdoor interface to work since it
4882 changes registers values during IO operation */ 4454 changes registers values during IO operation */
4883 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4455 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
@@ -5096,17 +4668,17 @@ static void kvm_timer_init(void)
5096 4668
5097static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4669static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5098 4670
5099static int kvm_is_in_guest(void) 4671int kvm_is_in_guest(void)
5100{ 4672{
5101 return percpu_read(current_vcpu) != NULL; 4673 return __this_cpu_read(current_vcpu) != NULL;
5102} 4674}
5103 4675
5104static int kvm_is_user_mode(void) 4676static int kvm_is_user_mode(void)
5105{ 4677{
5106 int user_mode = 3; 4678 int user_mode = 3;
5107 4679
5108 if (percpu_read(current_vcpu)) 4680 if (__this_cpu_read(current_vcpu))
5109 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); 4681 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5110 4682
5111 return user_mode != 0; 4683 return user_mode != 0;
5112} 4684}
@@ -5115,8 +4687,8 @@ static unsigned long kvm_get_guest_ip(void)
5115{ 4687{
5116 unsigned long ip = 0; 4688 unsigned long ip = 0;
5117 4689
5118 if (percpu_read(current_vcpu)) 4690 if (__this_cpu_read(current_vcpu))
5119 ip = kvm_rip_read(percpu_read(current_vcpu)); 4691 ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5120 4692
5121 return ip; 4693 return ip;
5122} 4694}
@@ -5129,13 +4701,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
5129 4701
5130void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 4702void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5131{ 4703{
5132 percpu_write(current_vcpu, vcpu); 4704 __this_cpu_write(current_vcpu, vcpu);
5133} 4705}
5134EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 4706EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5135 4707
5136void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 4708void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5137{ 4709{
5138 percpu_write(current_vcpu, NULL); 4710 __this_cpu_write(current_vcpu, NULL);
5139} 4711}
5140EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 4712EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5141 4713
@@ -5234,15 +4806,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5234} 4806}
5235EXPORT_SYMBOL_GPL(kvm_emulate_halt); 4807EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5236 4808
5237static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
5238 unsigned long a1)
5239{
5240 if (is_long_mode(vcpu))
5241 return a0;
5242 else
5243 return a0 | ((gpa_t)a1 << 32);
5244}
5245
5246int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 4809int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5247{ 4810{
5248 u64 param, ingpa, outgpa, ret; 4811 u64 param, ingpa, outgpa, ret;
@@ -5338,9 +4901,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5338 case KVM_HC_VAPIC_POLL_IRQ: 4901 case KVM_HC_VAPIC_POLL_IRQ:
5339 ret = 0; 4902 ret = 0;
5340 break; 4903 break;
5341 case KVM_HC_MMU_OP:
5342 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
5343 break;
5344 default: 4904 default:
5345 ret = -KVM_ENOSYS; 4905 ret = -KVM_ENOSYS;
5346 break; 4906 break;
@@ -5370,125 +4930,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5370 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 4930 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5371} 4931}
5372 4932
5373static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
5374{
5375 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
5376 int j, nent = vcpu->arch.cpuid_nent;
5377
5378 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
5379 /* when no next entry is found, the current entry[i] is reselected */
5380 for (j = i + 1; ; j = (j + 1) % nent) {
5381 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
5382 if (ej->function == e->function) {
5383 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
5384 return j;
5385 }
5386 }
5387 return 0; /* silence gcc, even though control never reaches here */
5388}
5389
5390/* find an entry with matching function, matching index (if needed), and that
5391 * should be read next (if it's stateful) */
5392static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
5393 u32 function, u32 index)
5394{
5395 if (e->function != function)
5396 return 0;
5397 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
5398 return 0;
5399 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
5400 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
5401 return 0;
5402 return 1;
5403}
5404
5405struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
5406 u32 function, u32 index)
5407{
5408 int i;
5409 struct kvm_cpuid_entry2 *best = NULL;
5410
5411 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
5412 struct kvm_cpuid_entry2 *e;
5413
5414 e = &vcpu->arch.cpuid_entries[i];
5415 if (is_matching_cpuid_entry(e, function, index)) {
5416 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
5417 move_to_next_stateful_cpuid_entry(vcpu, i);
5418 best = e;
5419 break;
5420 }
5421 }
5422 return best;
5423}
5424EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
5425
5426int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
5427{
5428 struct kvm_cpuid_entry2 *best;
5429
5430 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
5431 if (!best || best->eax < 0x80000008)
5432 goto not_found;
5433 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
5434 if (best)
5435 return best->eax & 0xff;
5436not_found:
5437 return 36;
5438}
5439
5440/*
5441 * If no match is found, check whether we exceed the vCPU's limit
5442 * and return the content of the highest valid _standard_ leaf instead.
5443 * This is to satisfy the CPUID specification.
5444 */
5445static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
5446 u32 function, u32 index)
5447{
5448 struct kvm_cpuid_entry2 *maxlevel;
5449
5450 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
5451 if (!maxlevel || maxlevel->eax >= function)
5452 return NULL;
5453 if (function & 0x80000000) {
5454 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
5455 if (!maxlevel)
5456 return NULL;
5457 }
5458 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
5459}
5460
5461void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
5462{
5463 u32 function, index;
5464 struct kvm_cpuid_entry2 *best;
5465
5466 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
5467 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5468 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
5469 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
5470 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
5471 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
5472 best = kvm_find_cpuid_entry(vcpu, function, index);
5473
5474 if (!best)
5475 best = check_cpuid_limit(vcpu, function, index);
5476
5477 if (best) {
5478 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
5479 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
5480 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
5481 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
5482 }
5483 kvm_x86_ops->skip_emulated_instruction(vcpu);
5484 trace_kvm_cpuid(function,
5485 kvm_register_read(vcpu, VCPU_REGS_RAX),
5486 kvm_register_read(vcpu, VCPU_REGS_RBX),
5487 kvm_register_read(vcpu, VCPU_REGS_RCX),
5488 kvm_register_read(vcpu, VCPU_REGS_RDX));
5489}
5490EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
5491
5492/* 4933/*
5493 * Check if userspace requested an interrupt window, and that the 4934 * Check if userspace requested an interrupt window, and that the
5494 * interrupt window is open. 4935 * interrupt window is open.
@@ -5649,6 +5090,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5649 int r; 5090 int r;
5650 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5091 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5651 vcpu->run->request_interrupt_window; 5092 vcpu->run->request_interrupt_window;
5093 bool req_immediate_exit = 0;
5652 5094
5653 if (vcpu->requests) { 5095 if (vcpu->requests) {
5654 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 5096 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5688,7 +5130,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5688 record_steal_time(vcpu); 5130 record_steal_time(vcpu);
5689 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 5131 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5690 process_nmi(vcpu); 5132 process_nmi(vcpu);
5691 5133 req_immediate_exit =
5134 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
5135 if (kvm_check_request(KVM_REQ_PMU, vcpu))
5136 kvm_handle_pmu_event(vcpu);
5137 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5138 kvm_deliver_pmi(vcpu);
5692 } 5139 }
5693 5140
5694 r = kvm_mmu_reload(vcpu); 5141 r = kvm_mmu_reload(vcpu);
@@ -5739,6 +5186,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5739 5186
5740 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5187 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5741 5188
5189 if (req_immediate_exit)
5190 smp_send_reschedule(vcpu->cpu);
5191
5742 kvm_guest_enter(); 5192 kvm_guest_enter();
5743 5193
5744 if (unlikely(vcpu->arch.switch_db_regs)) { 5194 if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -5944,10 +5394,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5944 if (r <= 0) 5394 if (r <= 0)
5945 goto out; 5395 goto out;
5946 5396
5947 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
5948 kvm_register_write(vcpu, VCPU_REGS_RAX,
5949 kvm_run->hypercall.ret);
5950
5951 r = __vcpu_run(vcpu); 5397 r = __vcpu_run(vcpu);
5952 5398
5953out: 5399out:
@@ -6149,7 +5595,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6149 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5595 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6150 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5596 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6151 if (sregs->cr4 & X86_CR4_OSXSAVE) 5597 if (sregs->cr4 & X86_CR4_OSXSAVE)
6152 update_cpuid(vcpu); 5598 kvm_update_cpuid(vcpu);
6153 5599
6154 idx = srcu_read_lock(&vcpu->kvm->srcu); 5600 idx = srcu_read_lock(&vcpu->kvm->srcu);
6155 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5601 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
@@ -6426,6 +5872,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6426 kvm_async_pf_hash_reset(vcpu); 5872 kvm_async_pf_hash_reset(vcpu);
6427 vcpu->arch.apf.halted = false; 5873 vcpu->arch.apf.halted = false;
6428 5874
5875 kvm_pmu_reset(vcpu);
5876
6429 return kvm_x86_ops->vcpu_reset(vcpu); 5877 return kvm_x86_ops->vcpu_reset(vcpu);
6430} 5878}
6431 5879
@@ -6474,10 +5922,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6474 kvm = vcpu->kvm; 5922 kvm = vcpu->kvm;
6475 5923
6476 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 5924 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6477 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
6478 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
6479 vcpu->arch.mmu.translate_gpa = translate_gpa;
6480 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
6481 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5925 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6482 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5926 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6483 else 5927 else
@@ -6514,6 +5958,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6514 goto fail_free_mce_banks; 5958 goto fail_free_mce_banks;
6515 5959
6516 kvm_async_pf_hash_reset(vcpu); 5960 kvm_async_pf_hash_reset(vcpu);
5961 kvm_pmu_init(vcpu);
6517 5962
6518 return 0; 5963 return 0;
6519fail_free_mce_banks: 5964fail_free_mce_banks:
@@ -6532,6 +5977,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6532{ 5977{
6533 int idx; 5978 int idx;
6534 5979
5980 kvm_pmu_destroy(vcpu);
6535 kfree(vcpu->arch.mce_banks); 5981 kfree(vcpu->arch.mce_banks);
6536 kvm_free_lapic(vcpu); 5982 kvm_free_lapic(vcpu);
6537 idx = srcu_read_lock(&vcpu->kvm->srcu); 5983 idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d36fe237c665..cb80c293cdd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
33 return (nr == BP_VECTOR) || (nr == OF_VECTOR); 33 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
34} 34}
35 35
36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
37 u32 function, u32 index);
38
39static inline bool is_protmode(struct kvm_vcpu *vcpu) 36static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{ 37{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE); 38 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125 gva_t addr, void *val, unsigned int bytes, 122 gva_t addr, void *val, unsigned int bytes,
126 struct x86_exception *exception); 123 struct x86_exception *exception);
127 124
125extern u64 host_xcr0;
126
128#endif 127#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603ba866f..642d8805bc1b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
856} 856}
857 857
858/* 858/*
859 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so 859 * Interrupt descriptors are allocated as-needed, but low-numbered ones are
860 * rather than set them in lguest_init_IRQ we are called here every time an 860 * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
861 * lguest device needs an interrupt. 861 * tells us the irq is already used: other errors (ie. ENOMEM) we take
862 * 862 * seriously.
863 * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
864 * pass that up!
865 */ 863 */
866void lguest_setup_irq(unsigned int irq) 864int lguest_setup_irq(unsigned int irq)
867{ 865{
868 irq_alloc_desc_at(irq, 0); 866 int err;
867
868 /* Returns -ve error or vector number. */
869 err = irq_alloc_desc_at(irq, 0);
870 if (err < 0 && err != -EEXIST)
871 return err;
872
869 irq_set_chip_and_handler_name(irq, &lguest_irq_controller, 873 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
870 handle_level_irq, "level"); 874 handle_level_irq, "level");
875 return 0;
871} 876}
872 877
873/* 878/*
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 46fc4ee09fc4..88ad5fbda6e1 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
82 const insn_attr_t *table; 82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) 83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0; 84 return 0;
85 table = inat_avx_tables[vex_m][vex_p]; 85 /* At first, this checks the master table */
86 table = inat_avx_tables[vex_m][0];
86 if (!table) 87 if (!table)
87 return 0; 88 return 0;
89 if (!inat_is_group(table[opcode]) && vex_p) {
90 /* If this is not a group, get attribute directly */
91 table = inat_avx_tables[vex_m][vex_p];
92 if (!table)
93 return 0;
94 }
88 return table[opcode]; 95 return table[opcode];
89} 96}
90 97
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 374562ed6704..5a1f9f3e3fbb 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn)
202 m = insn_vex_m_bits(insn); 202 m = insn_vex_m_bits(insn);
203 p = insn_vex_p_bits(insn); 203 p = insn_vex_p_bits(insn);
204 insn->attr = inat_get_avx_attribute(op, m, p); 204 insn->attr = inat_get_avx_attribute(op, m, p);
205 if (!inat_accept_vex(insn->attr)) 205 if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
206 insn->attr = 0; /* This instruction is bad */ 206 insn->attr = 0; /* This instruction is bad */
207 goto end; /* VEX has only 1 byte for opcode */ 207 goto end; /* VEX has only 1 byte for opcode */
208 } 208 }
@@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn)
249 pfx = insn_last_prefix(insn); 249 pfx = insn_last_prefix(insn);
250 insn->attr = inat_get_group_attribute(mod, pfx, 250 insn->attr = inat_get_group_attribute(mod, pfx,
251 insn->attr); 251 insn->attr);
252 if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
253 insn->attr = 0; /* This is bad */
252 } 254 }
253 } 255 }
254 256
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 82004d2bf05e..bd59090825db 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
164size_t strlen(const char *s) 164size_t strlen(const char *s)
165{ 165{
166 int d0; 166 int d0;
167 int res; 167 size_t res;
168 asm volatile("repne\n\t" 168 asm volatile("repne\n\t"
169 "scasb\n\t" 169 "scasb"
170 "notl %0\n\t"
171 "decl %0"
172 : "=c" (res), "=&D" (d0) 170 : "=c" (res), "=&D" (d0)
173 : "1" (s), "a" (0), "0" (0xffffffffu) 171 : "1" (s), "a" (0), "0" (0xffffffffu)
174 : "memory"); 172 : "memory");
175 return res; 173 return ~res - 1;
176} 174}
177EXPORT_SYMBOL(strlen); 175EXPORT_SYMBOL(strlen);
178#endif 176#endif
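The strlen() change above moves the final arithmetic out of the asm block: with ECX preloaded to 0xffffffff and AL set to 0, "repne; scasb" decrements ECX once per byte scanned, including the terminating NUL, leaving ECX = 0xffffffff - (len + 1). The removed "notl"/"decl" pair and the new C expression "~res - 1" both recover len from that value. A small stand-alone sketch of the arithmetic, written as ordinary user-space C rather than kernel code, just to check the identity:

#include <assert.h>
#include <stdint.h>

/*
 * Simulate the register value "repne; scasb" leaves behind for a string of
 * the given length, then recover the length the way the new code does.
 */
static uint32_t ecx_after_scan(uint32_t len)
{
	return 0xffffffffu - (len + 1);	/* one decrement per byte, NUL included */
}

static uint32_t recovered_len(uint32_t ecx)
{
	return ~ecx - 1;		/* matches "return ~res - 1;" above */
}

int main(void)
{
	assert(recovered_len(ecx_after_scan(0)) == 0);	/* empty string */
	assert(recovered_len(ecx_after_scan(5)) == 5);	/* e.g. "hello" */
	return 0;
}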
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index a793da5e560e..819137904428 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,5 +1,11 @@
1# x86 Opcode Maps 1# x86 Opcode Maps
2# 2#
3# This is (mostly) based on the following documentation:
4# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
5# (#325383-040US, October 2011)
6# - Intel(R) Advanced Vector Extensions Programming Reference
7# (#319433-011,JUNE 2011).
8#
3#<Opcode maps> 9#<Opcode maps>
4# Table: table-name 10# Table: table-name
5# Referrer: escaped-name 11# Referrer: escaped-name
@@ -15,10 +21,13 @@
15# EndTable 21# EndTable
16# 22#
17# AVX Superscripts 23# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix. 24# (v): this opcode requires VEX prefix.
19# (oVEX): this opcode requires VEX prefix. 25# (v1): this opcode only supports 128bit VEX.
20# (o128): this opcode only supports 128bit VEX. 26#
21# (o256): this opcode only supports 256bit VEX. 27# Last Prefix Superscripts
28# - (66): the last prefix is 0x66
29# - (F3): the last prefix is 0xF3
30# - (F2): the last prefix is 0xF2
22# 31#
23 32
24Table: one byte opcode 33Table: one byte opcode
@@ -199,8 +208,8 @@ a0: MOV AL,Ob
199a1: MOV rAX,Ov 208a1: MOV rAX,Ov
200a2: MOV Ob,AL 209a2: MOV Ob,AL
201a3: MOV Ov,rAX 210a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb 211a4: MOVS/B Yb,Xb
203a5: MOVS/W/D/Q Xv,Yv 212a5: MOVS/W/D/Q Yv,Xv
204a6: CMPS/B Xb,Yb 213a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv 214a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib 215a8: TEST AL,Ib
@@ -210,7 +219,9 @@ ab: STOS/W/D/Q Yv,rAX
210ac: LODS/B AL,Xb 219ac: LODS/B AL,Xb
211ad: LODS/W/D/Q rAX,Xv 220ad: LODS/W/D/Q rAX,Xv
212ae: SCAS/B AL,Yb 221ae: SCAS/B AL,Yb
213af: SCAS/W/D/Q rAX,Xv 222# Note: The May 2011 Intel manual shows Xv for the second parameter of the
223# next instruction but Yv is correct
224af: SCAS/W/D/Q rAX,Yv
214# 0xb0 - 0xbf 225# 0xb0 - 0xbf
215b0: MOV AL/R8L,Ib 226b0: MOV AL/R8L,Ib
216b1: MOV CL/R9L,Ib 227b1: MOV CL/R9L,Ib
@@ -233,8 +244,8 @@ c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A) 244c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64) 245c2: RETN Iw (f64)
235c3: RETN 246c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) 247c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) 248c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
238c6: Grp11 Eb,Ib (1A) 249c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A) 250c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib 251c8: ENTER Iw,Ib
@@ -320,14 +331,19 @@ AVXcode: 1
320# 3DNow! uses the last imm byte as opcode extension. 331# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib 3320f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f 333# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) 334# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) 335# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) 336# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
32613: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) 337# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) 338# Reference A.1
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) 33910: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) 34011: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) 34112: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
34213: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
34314: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
34415: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
34516: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
34617: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
33118: Grp16 (1A) 34718: Grp16 (1A)
33219: 34819:
3331a: 3491a:
@@ -345,14 +361,14 @@ AVXcode: 1
34525: 36125:
34626: 36226:
34727: 36327:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) 36428: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) 36529: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) 3662a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) 3672b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) 3682c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) 3692d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) 3702e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) 3712f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
356# 0x0f 0x30-0x3f 372# 0x0f 0x30-0x3f
35730: WRMSR 37330: WRMSR
35831: RDTSC 37431: RDTSC
@@ -388,65 +404,66 @@ AVXcode: 1
3884e: CMOVLE/NG Gv,Ev 4044e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev 4054f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f 406# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) 40750: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) 40851: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) 40952: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) 41053: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) 41154: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) 41255: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) 41356: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) 41457: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) 41558: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) 41659: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) 4175a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) 4185b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) 4195c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) 4205d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) 4215e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) 4225f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
407# 0x0f 0x60-0x6f 423# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) 42460: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) 42561: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) 42662: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) 42763: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) 42864: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) 42965: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) 43066: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) 43167: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) 43268: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) 43369: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) 4346a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) 4356b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) 4366c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) 4376d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
4226e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) 4386e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) 4396f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
424# 0x0f 0x70-0x7f 440# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) 44170: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
42671: Grp12 (1A) 44271: Grp12 (1A)
42772: Grp13 (1A) 44372: Grp13 (1A)
42873: Grp14 (1A) 44473: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) 44574: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) 44675: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) 44776: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
43277: emms/vzeroupper/vzeroall (VEX) 448# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
43378: VMREAD Ed/q,Gd/q 44977: emms | vzeroupper | vzeroall
43479: VMWRITE Gd/q,Ed/q 45078: VMREAD Ey,Gy
45179: VMWRITE Gy,Ey
4357a: 4527a:
4367b: 4537b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) 4547c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) 4557d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) 4567e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) 4577f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
441# 0x0f 0x80-0x8f 458# 0x0f 0x80-0x8f
44280: JO Jz (f64) 45980: JO Jz (f64)
44381: JNO Jz (f64) 46081: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64) 46182: JB/JC/JNAE Jz (f64)
44583: JNB/JAE/JNC Jz (f64) 46283: JAE/JNB/JNC Jz (f64)
44684: JZ/JE Jz (f64) 46384: JE/JZ Jz (f64)
44785: JNZ/JNE Jz (f64) 46485: JNE/JNZ Jz (f64)
44886: JBE/JNA Jz (f64) 46586: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64) 46687: JA/JNBE Jz (f64)
45088: JS Jz (f64) 46788: JS Jz (f64)
45189: JNS Jz (f64) 46889: JNS Jz (f64)
4528a: JP/JPE Jz (f64) 4698a: JP/JPE Jz (f64)
@@ -502,18 +519,18 @@ b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A) 519b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A) 520ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv 521bb: BTC Ev,Gv
505bc: BSF Gv,Ev 522bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
506bd: BSR Gv,Ev 523bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
507be: MOVSX Gv,Eb 524be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew 525bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf 526# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb 527c0: XADD Eb,Gb
511c1: XADD Ev,Gv 528c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) 529c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
513c3: movnti Md/q,Gd/q 530c3: movnti My,Gy
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) 531c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) 532c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) 533c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
517c7: Grp9 (1A) 534c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D 535c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D 536c9: BSWAP RCX/ECX/R9/R9D
@@ -524,55 +541,55 @@ cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D 541ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D 542cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf 543# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) 544d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) 545d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) 546d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) 547d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) 548d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) 549d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) 550d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) 551d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) 552d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) 553d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) 554da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) 555db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) 556dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) 557dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) 558de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) 559df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
543# 0x0f 0xe0-0xef 560# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) 561e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) 562e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) 563e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) 564e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) 565e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) 566e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) 567e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) 568e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) 569e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) 570e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) 571ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) 572eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) 573ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) 574ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) 575ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) 576ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
560# 0x0f 0xf0-0xff 577# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX) 578f0: vlddqu Vx,Mx (F2)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) 579f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) 580f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) 581f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) 582f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) 583f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) 584f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) 585f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) 586f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) 587f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) 588fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) 589fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) 590fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) 591fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) 592fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
576ff: 593ff:
577EndTable 594EndTable
578 595
@@ -580,155 +597,193 @@ Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1 597Referrer: 3-byte escape 1
581AVXcode: 2 598AVXcode: 2
582# 0x0f 0x38 0x00-0x0f 599# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) 60000: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) 60101: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) 60202: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) 60303: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) 60404: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) 60505: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) 60606: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) 60707: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) 60808: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) 60909: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) 6100a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) 6110b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
5950c: Vpermilps /r (66),(oVEX) 6120c: vpermilps Vx,Hx,Wx (66),(v)
5960d: Vpermilpd /r (66),(oVEX) 6130d: vpermilpd Vx,Hx,Wx (66),(v)
5970e: vtestps /r (66),(oVEX) 6140e: vtestps Vx,Wx (66),(v)
5980f: vtestpd /r (66),(oVEX) 6150f: vtestpd Vx,Wx (66),(v)
599# 0x0f 0x38 0x10-0x1f 616# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66) 61710: pblendvb Vdq,Wdq (66)
60111: 61811:
60212: 61912:
60313: 62013: vcvtph2ps Vx,Wx,Ib (66),(v)
60414: blendvps Vdq,Wdq (66) 62114: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66) 62215: blendvpd Vdq,Wdq (66)
60616: 62316: vpermps Vqq,Hqq,Wqq (66),(v)
60717: ptest Vdq,Wdq (66),(VEX) 62417: vptest Vx,Wx (66)
60818: vbroadcastss /r (66),(oVEX) 62518: vbroadcastss Vx,Wd (66),(v)
60919: vbroadcastsd /r (66),(oVEX),(o256) 62619: vbroadcastsd Vqq,Wq (66),(v)
6101a: vbroadcastf128 /r (66),(oVEX),(o256) 6271a: vbroadcastf128 Vqq,Mdq (66),(v)
6111b: 6281b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) 6291c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) 6301d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) 6311e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
6151f: 6321f:
616# 0x0f 0x38 0x20-0x2f 633# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) 63420: vpmovsxbw Vx,Ux/Mq (66),(v1)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) 63521: vpmovsxbd Vx,Ux/Md (66),(v1)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) 63622: vpmovsxbq Vx,Ux/Mw (66),(v1)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) 63723: vpmovsxwd Vx,Ux/Mq (66),(v1)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) 63824: vpmovsxwq Vx,Ux/Md (66),(v1)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) 63925: vpmovsxdq Vx,Ux/Mq (66),(v1)
62326: 64026:
62427: 64127:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128) 64228: vpmuldq Vx,Hx,Wx (66),(v1)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128) 64329: vpcmpeqq Vx,Hx,Wx (66),(v1)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128) 6442a: vmovntdqa Vx,Mx (66),(v1)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128) 6452b: vpackusdw Vx,Hx,Wx (66),(v1)
6292c: vmaskmovps(ld) /r (66),(oVEX) 6462c: vmaskmovps Vx,Hx,Mx (66),(v)
6302d: vmaskmovpd(ld) /r (66),(oVEX) 6472d: vmaskmovpd Vx,Hx,Mx (66),(v)
6312e: vmaskmovps(st) /r (66),(oVEX) 6482e: vmaskmovps Mx,Hx,Vx (66),(v)
6322f: vmaskmovpd(st) /r (66),(oVEX) 6492f: vmaskmovpd Mx,Hx,Vx (66),(v)
633# 0x0f 0x38 0x30-0x3f 650# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) 65130: vpmovzxbw Vx,Ux/Mq (66),(v1)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) 65231: vpmovzxbd Vx,Ux/Md (66),(v1)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) 65332: vpmovzxbq Vx,Ux/Mw (66),(v1)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) 65433: vpmovzxwd Vx,Ux/Mq (66),(v1)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) 65534: vpmovzxwq Vx,Ux/Md (66),(v1)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) 65635: vpmovzxdq Vx,Ux/Mq (66),(v1)
64036: 65736: vpermd Vqq,Hqq,Wqq (66),(v)
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128) 65837: vpcmpgtq Vx,Hx,Wx (66),(v1)
64238: pminsb Vdq,Wdq (66),(VEX),(o128) 65938: vpminsb Vx,Hx,Wx (66),(v1)
64339: pminsd Vdq,Wdq (66),(VEX),(o128) 66039: vpminsd Vx,Hx,Wx (66),(v1)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128) 6613a: vpminuw Vx,Hx,Wx (66),(v1)
6453b: pminud Vdq,Wdq (66),(VEX),(o128) 6623b: vpminud Vx,Hx,Wx (66),(v1)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128) 6633c: vpmaxsb Vx,Hx,Wx (66),(v1)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128) 6643d: vpmaxsd Vx,Hx,Wx (66),(v1)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128) 6653e: vpmaxuw Vx,Hx,Wx (66),(v1)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128) 6663f: vpmaxud Vx,Hx,Wx (66),(v1)
650# 0x0f 0x38 0x40-0x8f 667# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128) 66840: vpmulld Vx,Hx,Wx (66),(v1)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128) 66941: vphminposuw Vdq,Wdq (66),(v1)
65380: INVEPT Gd/q,Mdq (66) 67042:
65481: INVPID Gd/q,Mdq (66) 67143:
67244:
67345: vpsrlvd/q Vx,Hx,Wx (66),(v)
67446: vpsravd Vx,Hx,Wx (66),(v)
67547: vpsllvd/q Vx,Hx,Wx (66),(v)
676# Skip 0x48-0x57
67758: vpbroadcastd Vx,Wx (66),(v)
67859: vpbroadcastq Vx,Wx (66),(v)
6795a: vbroadcasti128 Vqq,Mdq (66),(v)
680# Skip 0x5b-0x77
68178: vpbroadcastb Vx,Wx (66),(v)
68279: vpbroadcastw Vx,Wx (66),(v)
683# Skip 0x7a-0x7f
68480: INVEPT Gy,Mdq (66)
68581: INVPID Gy,Mdq (66)
68682: INVPCID Gy,Mdq (66)
6878c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
6888e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
655# 0x0f 0x38 0x90-0xbf (FMA) 689# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX) 69090: vgatherdd/q Vx,Hx,Wx (66),(v)
65797: vfmsubadd132pd/ps /r (66),(VEX) 69191: vgatherqd/q Vx,Hx,Wx (66),(v)
65898: vfmadd132pd/ps /r (66),(VEX) 69292: vgatherdps/d Vx,Hx,Wx (66),(v)
65999: vfmadd132sd/ss /r (66),(VEX),(o128) 69393: vgatherqps/d Vx,Hx,Wx (66),(v)
6609a: vfmsub132pd/ps /r (66),(VEX) 69494:
6619b: vfmsub132sd/ss /r (66),(VEX),(o128) 69595:
6629c: vfnmadd132pd/ps /r (66),(VEX) 69696: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128) 69797: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
6649e: vfnmsub132pd/ps /r (66),(VEX) 69898: vfmadd132ps/d Vx,Hx,Wx (66),(v)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128) 69999: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
666a6: vfmaddsub213pd/ps /r (66),(VEX) 7009a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
667a7: vfmsubadd213pd/ps /r (66),(VEX) 7019b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
668a8: vfmadd213pd/ps /r (66),(VEX) 7029c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128) 7039d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
670aa: vfmsub213pd/ps /r (66),(VEX) 7049e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128) 7059f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
672ac: vfnmadd213pd/ps /r (66),(VEX) 706a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128) 707a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
674ae: vfnmsub213pd/ps /r (66),(VEX) 708a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128) 709a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
676b6: vfmaddsub231pd/ps /r (66),(VEX) 710aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
677b7: vfmsubadd231pd/ps /r (66),(VEX) 711ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
678b8: vfmadd231pd/ps /r (66),(VEX) 712ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128) 713ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
680ba: vfmsub231pd/ps /r (66),(VEX) 714ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128) 715af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
682bc: vfnmadd231pd/ps /r (66),(VEX) 716b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128) 717b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
684be: vfnmsub231pd/ps /r (66),(VEX) 718b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128) 719b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
720ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
721bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
722bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
723bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
724be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
725bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
686# 0x0f 0x38 0xc0-0xff 726# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128) 727db: VAESIMC Vdq,Wdq (66),(v1)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128) 728dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128) 729dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
690de: aesdec Vdq,Wdq (66),(VEX),(o128) 730de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128) 731df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) 732f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) 733f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
734f2: ANDN Gy,By,Ey (v)
735f3: Grp17 (1A)
736f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
737f6: MULX By,Gy,rDX,Ey (F2),(v)
738f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
694EndTable 739EndTable
695 740
696Table: 3-byte opcode 2 (0x0f 0x3a) 741Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2 742Referrer: 3-byte escape 2
698AVXcode: 3 743AVXcode: 3
699# 0x0f 0x3a 0x00-0xff 744# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX) 74500: vpermq Vqq,Wqq,Ib (66),(v)
70105: vpermilpd /r,Ib (66),(oVEX) 74601: vpermpd Vqq,Wqq,Ib (66),(v)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256) 74702: vpblendd Vx,Hx,Wx,Ib (66),(v)
70308: roundps Vdq,Wdq,Ib (66),(VEX) 74803:
70409: roundpd Vdq,Wdq,Ib (66),(VEX) 74904: vpermilps Vx,Wx,Ib (66),(v)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128) 75005: vpermilpd Vx,Wx,Ib (66),(v)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) 75106: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
7070c: blendps Vdq,Wdq,Ib (66),(VEX) 75207:
7080d: blendpd Vdq,Wdq,Ib (66),(VEX) 75308: vroundps Vx,Wx,Ib (66)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) 75409: vroundpd Vx,Wx,Ib (66)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) 7550a: vroundss Vss,Wss,Ib (66),(v1)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) 7560b: vroundsd Vsd,Wsd,Ib (66),(v1)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) 7570c: vblendps Vx,Hx,Wx,Ib (66)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) 7580d: vblendpd Vx,Hx,Wx,Ib (66)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128) 7590e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256) 7600f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
71619: vextractf128 /r,Ib (66),(oVEX),(o256) 76114: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) 76215: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) 76316: vpextrd/q Ey,Vdq,Ib (66),(v1)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) 76417: vextractps Ed,Vdq,Ib (66),(v1)
72040: dpps Vdq,Wdq,Ib (66),(VEX) 76518: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128) 76619: vextractf128 Wdq,Vqq,Ib (66),(v)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) 7671d: vcvtps2ph Wx,Vx,Ib (66),(v)
72344: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) 76820: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
7244a: vblendvps /r,Ib (66),(oVEX) 76921: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
7254b: vblendvpd /r,Ib (66),(oVEX) 77022: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128) 77138: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) 77239: vextracti128 Wdq,Vqq,Ib (66),(v)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) 77340: vdpps Vx,Hx,Wx,Ib (66)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) 77441: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) 77542: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) 77644: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
77746: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
7784a: vblendvps Vx,Hx,Wx,Lx (66),(v)
7794b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
7804c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
78160: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
78261: vpcmpestri Vdq,Wdq,Ib (66),(v1)
78362: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
78463: vpcmpistri Vdq,Wdq,Ib (66),(v1)
785df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
786f0: RORX Gy,Ey,Ib (F2),(v)
732EndTable 787EndTable
733 788
734GrpTable: Grp1 789GrpTable: Grp1
@@ -790,7 +845,7 @@ GrpTable: Grp5
7902: CALLN Ev (f64) 8452: CALLN Ev (f64)
7913: CALLF Ep 8463: CALLF Ep
7924: JMPN Ev (f64) 8474: JMPN Ev (f64)
7935: JMPF Ep 8485: JMPF Mp
7946: PUSH Ev (d64) 8496: PUSH Ev (d64)
7957: 8507:
796EndTable 851EndTable
@@ -807,7 +862,7 @@ EndTable
807GrpTable: Grp7 862GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 8630: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) 8641: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) 8652: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
8113: LIDT Ms 8663: LIDT Ms
8124: SMSW Mw/Rv 8674: SMSW Mw/Rv
8135: 8685:
@@ -824,44 +879,45 @@ EndTable
824 879
825GrpTable: Grp9 880GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq 8811: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) 8826: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
8287: VMPTRST Mq 8837: VMPTRST Mq | VMPTRST Mq (F3)
829EndTable 884EndTable
830 885
831GrpTable: Grp10 886GrpTable: Grp10
832EndTable 887EndTable
833 888
834GrpTable: Grp11 889GrpTable: Grp11
890# Note: the operands are given by group opcode
8350: MOV 8910: MOV
836EndTable 892EndTable
837 893
838GrpTable: Grp12 894GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) 8952: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) 8964: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) 8976: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
842EndTable 898EndTable
843 899
844GrpTable: Grp13 900GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) 9012: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) 9024: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) 9036: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
848EndTable 904EndTable
849 905
850GrpTable: Grp14 906GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) 9072: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128) 9083: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) 9096: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128) 9107: vpslldq Hx,Ux,Ib (66),(11B),(v1)
855EndTable 911EndTable
856 912
857GrpTable: Grp15 913GrpTable: Grp15
8580: fxsave 9140: fxsave | RDFSBASE Ry (F3),(11B)
8591: fxstor 9151: fxstor | RDGSBASE Ry (F3),(11B)
8602: ldmxcsr (VEX) 9162: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
8613: stmxcsr (VEX) 9173: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
8624: XSAVE 9184: XSAVE
8635: XRSTOR | lfence (11B) 9195: XRSTOR | lfence (11B)
8646: mfence (11B) 9206: XSAVEOPT | mfence (11B)
8657: clflush | sfence (11B) 9217: clflush | sfence (11B)
866EndTable 922EndTable
867 923
@@ -872,6 +928,12 @@ GrpTable: Grp16
8723: prefetch T2 9283: prefetch T2
873EndTable 929EndTable
874 930
931GrpTable: Grp17
9321: BLSR By,Ey (v)
9332: BLSMSK By,Ey (v)
9343: BLSI By,Ey (v)
935EndTable
936
875# AMD's Prefetch Group 937# AMD's Prefetch Group
876GrpTable: GrpP 938GrpTable: GrpP
8770: PREFETCH 9390: PREFETCH
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad2a6e5..1fb85dbe390a 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
         if (fixup) {
                 /* If fixup is less than 16, it means uaccess error */
                 if (fixup->fixup < 16) {
-                        current_thread_info()->uaccess_err = -EFAULT;
+                        current_thread_info()->uaccess_err = 1;
                         regs->ip += fixup->fixup;
                         return 1;
                 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5db0490deb07..9d74824a708d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-           unsigned long address)
+           unsigned long address, int signal, int si_code)
 {
         struct task_struct *tsk = current;
         unsigned long *stackend;
@@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
         int sig;
 
         /* Are we prepared to handle this kernel fault? */
-        if (fixup_exception(regs))
+        if (fixup_exception(regs)) {
+                if (current_thread_info()->sig_on_uaccess_error && signal) {
+                        tsk->thread.trap_no = 14;
+                        tsk->thread.error_code = error_code | PF_USER;
+                        tsk->thread.cr2 = address;
+
+                        /* XXX: hwpoison faults will set the wrong code. */
+                        force_sig_info_fault(signal, si_code, address, tsk, 0);
+                }
                 return;
+        }
 
         /*
          * 32-bit:
@@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
         if (is_f00f_bug(regs, address))
                 return;
 
-        no_context(regs, error_code, address);
+        no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
 static noinline void
@@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 
         /* Kernel mode? Handle exceptions or die: */
         if (!(error_code & PF_USER)) {
-                no_context(regs, error_code, address);
+                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                 return;
         }
 
@@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                 if (!(fault & VM_FAULT_RETRY))
                         up_read(&current->mm->mmap_sem);
                 if (!(error_code & PF_USER))
-                        no_context(regs, error_code, address);
+                        no_context(regs, error_code, address, 0, 0);
                 return 1;
         }
         if (!(fault & VM_FAULT_ERROR))
@@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                 /* Kernel mode? Handle exceptions or die: */
                 if (!(error_code & PF_USER)) {
                         up_read(&current->mm->mmap_sem);
-                        no_context(regs, error_code, address);
+                        no_context(regs, error_code, address,
+                                   SIGSEGV, SEGV_MAPERR);
                         return 1;
                 }
 
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ea305856151c..dd74e46828c0 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -201,6 +201,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
         do {
                 VM_BUG_ON(compound_head(page) != head);
                 pages[*nr] = page;
+                if (PageTail(page))
+                        get_huge_page_tail(page);
                 (*nr)++;
                 page++;
                 refs++;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index b49962662101..f4f29b19fac5 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -45,6 +45,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
         BUG_ON(!pte_none(*(kmap_pte-idx)));
         set_pte(kmap_pte-idx, mk_pte(page, prot));
+        arch_flush_lazy_mmu_mode();
 
         return (void *)vaddr;
 }
@@ -88,6 +89,7 @@ void __kunmap_atomic(void *kvaddr)
                  */
                 kpte_clear_flush(kmap_pte-idx, vaddr);
                 kmap_atomic_idx_pop();
+                arch_flush_lazy_mmu_mode();
         }
 #ifdef CONFIG_DEBUG_HIGHMEM
         else {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a298914058f9..6cabf6570d64 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
@@ -15,6 +16,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
 
 unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
         free_init_pages("initrd memory", start, PAGE_ALIGN(end));
 }
 #endif
+
+void __init zone_sizes_init(void)
+{
+        unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
+        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+        max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+        free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0c1da394a634..8663f6c47ccb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -668,22 +668,6 @@ void __init initmem_init(void)
 }
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static void __init zone_sizes_init(void)
-{
-        unsigned long max_zone_pfns[MAX_NR_ZONES];
-        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-        max_zone_pfns[ZONE_DMA] =
-                virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-#endif
-        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-        free_area_init_nodes(max_zone_pfns);
-}
-
 void __init setup_bootmem_allocator(void)
 {
         printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
@@ -754,6 +738,17 @@ void __init mem_init(void)
 #ifdef CONFIG_FLATMEM
         BUG_ON(!mem_map);
 #endif
+        /*
+         * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+         * be done before free_all_bootmem(). Memblock use free low memory for
+         * temporary data (see find_range_array()) and for this purpose can use
+         * pages that was already passed to the buddy allocator, hence marked as
+         * not accessible in the page tables when compiled with
+         * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+         * important here.
+         */
+        set_highmem_pages_init();
+
         /* this will put all low memory onto the freelists */
         totalram_pages += free_all_bootmem();
 
@@ -765,8 +760,6 @@ void __init mem_init(void)
                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                         reservedpages++;
 
-        set_highmem_pages_init();
-
         codesize = (unsigned long) &_etext - (unsigned long) &_text;
         datasize = (unsigned long) &_edata - (unsigned long) &_etext;
         initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a8a56ce3a962..436a0309db33 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -614,15 +614,6 @@ void __init initmem_init(void)
 
 void __init paging_init(void)
 {
-        unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-        max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
         sparse_memory_present_with_active_regions(MAX_NUMNODES);
         sparse_init();
 
@@ -634,7 +625,7 @@ void __init paging_init(void)
          */
         node_clear_state(0, N_NORMAL_MEMORY);
 
-        free_area_init_nodes(max_zone_pfns);
+        zone_sizes_init();
 }
 
 /*
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
          */
         if (current->flags & PF_RANDOMIZE) {
                 if (mmap_is_ia32())
-                        rnd = (long)get_random_int() % (1<<8);
+                        rnd = get_random_int() % (1<<8);
                 else
-                        rnd = (long)(get_random_int() % (1<<28));
+                        rnd = get_random_int() % (1<<28);
         }
         return rnd << PAGE_SHIFT;
 }
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b278a7..dc0b727742f4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
 
 /* module parameters */
 static unsigned long filter_offset;
-static int nommiotrace;
-static int trace_pc;
+static bool nommiotrace;
+static bool trace_pc;
 
 module_param(filter_offset, ulong, 0);
 module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 496f494593bf..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
@@ -422,8 +422,9 @@ static int __init numa_alloc_distance(void)
  * calls are ignored until the distance table is reset with
  * numa_reset_distance().
  *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
  * This is to allow simplification of specific NUMA config implementations.
  */
 void __init numa_set_distance(int from, int to, int distance)
@@ -431,8 +432,9 @@ void __init numa_set_distance(int from, int to, int distance)
         if (!numa_distance && numa_alloc_distance() < 0)
                 return;
 
-        if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
-                printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+        if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+                        from < 0 || to < 0) {
+                pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
                         from, to, distance);
                 return;
         }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e526742fa1..e1ebde315210 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -998,7 +998,7 @@ out_err:
 }
 EXPORT_SYMBOL(set_memory_uc);
 
-int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int addrinarray,
                 unsigned long new_type)
 {
         int i, j;
@@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
         }
 
         /*
-         * If page allocator is not up yet then do not call c_p_a():
-         */
-        if (!debug_pagealloc_enabled)
-                return;
-
-        /*
          * The return value is ignored as the calls cannot fail.
          * Large pages for identity mappings are not used at boot time
          * and hence no memory allocations during large page split.
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..1c1c4f46a7c1 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
         if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
                 return;
         pxm = pa->proximity_domain;
+        apic_id = pa->apic_id;
+        if (!cpu_has_x2apic && (apic_id >= 0xff)) {
+                printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+                        pxm, apic_id);
+                return;
+        }
         node = setup_node(pxm);
         if (node < 0) {
                 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
                 return;
         }
 
-        apic_id = pa->apic_id;
         if (apic_id >= MAX_LOCAL_APIC) {
                 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
                 return;
@@ -104,6 +109,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
         if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
                 return;
         pxm = pa->proximity_domain_lo;
+        if (acpi_srat_revision >= 2)
+                pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
         node = setup_node(pxm);
         if (node < 0) {
                 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -155,6 +162,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
         start = ma->base_address;
         end = start + ma->length;
         pxm = ma->proximity_domain;
+        if (acpi_srat_revision <= 1)
+                pxm &= 0xff;
         node = setup_node(pxm);
         if (node < 0) {
                 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bfab3fa10edc..7b65f752c5f8 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -568,8 +568,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
                                         break;
                                 }
                                 if (filter[i].jt != 0) {
-                                        if (filter[i].jf)
-                                                t_offset += is_near(f_offset) ? 2 : 6;
+                                        if (filter[i].jf && f_offset)
+                                                t_offset += is_near(f_offset) ? 2 : 5;
                                         EMIT_COND_JMP(t_op, t_offset);
                                         if (filter[i].jf)
                                                 EMIT_JMP(f_offset);
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 446902b2a6b6..1599f568f0e2 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
                 oprof.o cpu_buffer.o buffer_sync.o \
                 event_buffer.o oprofile_files.o \
                 oprofilefs.o oprofile_stats.o \
-                timer_int.o )
+                timer_int.o nmi_timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
 oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
                 op_model_ppro.o op_model_p4.o
-oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
index cdfe4c54deca..9e138d00ad36 100644
--- a/arch/x86/oprofile/init.c
+++ b/arch/x86/oprofile/init.c
@@ -16,34 +16,23 @@
16 * with the NMI mode driver. 16 * with the NMI mode driver.
17 */ 17 */
18 18
19#ifdef CONFIG_X86_LOCAL_APIC
19extern int op_nmi_init(struct oprofile_operations *ops); 20extern int op_nmi_init(struct oprofile_operations *ops);
20extern int op_nmi_timer_init(struct oprofile_operations *ops);
21extern void op_nmi_exit(void); 21extern void op_nmi_exit(void);
22extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); 22#else
23static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
24static void op_nmi_exit(void) { }
25#endif
23 26
27extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
24 28
25int __init oprofile_arch_init(struct oprofile_operations *ops) 29int __init oprofile_arch_init(struct oprofile_operations *ops)
26{ 30{
27 int ret;
28
29 ret = -ENODEV;
30
31#ifdef CONFIG_X86_LOCAL_APIC
32 ret = op_nmi_init(ops);
33#endif
34#ifdef CONFIG_X86_IO_APIC
35 if (ret < 0)
36 ret = op_nmi_timer_init(ops);
37#endif
38 ops->backtrace = x86_backtrace; 31 ops->backtrace = x86_backtrace;
39 32 return op_nmi_init(ops);
40 return ret;
41} 33}
42 34
43
44void oprofile_arch_exit(void) 35void oprofile_arch_exit(void)
45{ 36{
46#ifdef CONFIG_X86_LOCAL_APIC
47 op_nmi_exit(); 37 op_nmi_exit();
48#endif
49} 38}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 75f9528e0372..26b8a8514ee5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type)
595 return 0; 595 return 0;
596} 596}
597 597
598static int force_arch_perfmon; 598enum __force_cpu_type {
599static int force_cpu_type(const char *str, struct kernel_param *kp) 599 reserved = 0, /* do not force */
600 timer,
601 arch_perfmon,
602};
603
604static int force_cpu_type;
605
606static int set_cpu_type(const char *str, struct kernel_param *kp)
600{ 607{
601 if (!strcmp(str, "arch_perfmon")) { 608 if (!strcmp(str, "timer")) {
602 force_arch_perfmon = 1; 609 force_cpu_type = timer;
610 printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
611 } else if (!strcmp(str, "arch_perfmon")) {
612 force_cpu_type = arch_perfmon;
603 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 613 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
614 } else {
615 force_cpu_type = 0;
604 } 616 }
605 617
606 return 0; 618 return 0;
607} 619}
608module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); 620module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
609 621
610static int __init ppro_init(char **cpu_type) 622static int __init ppro_init(char **cpu_type)
611{ 623{
612 __u8 cpu_model = boot_cpu_data.x86_model; 624 __u8 cpu_model = boot_cpu_data.x86_model;
613 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ 625 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
614 626
615 if (force_arch_perfmon && cpu_has_arch_perfmon) 627 if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon)
616 return 0; 628 return 0;
617 629
618 /* 630 /*
@@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
679 if (!cpu_has_apic) 691 if (!cpu_has_apic)
680 return -ENODEV; 692 return -ENODEV;
681 693
694 if (force_cpu_type == timer)
695 return -ENODEV;
696
682 switch (vendor) { 697 switch (vendor) {
683 case X86_VENDOR_AMD: 698 case X86_VENDOR_AMD:
684 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 699 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
deleted file mode 100644
index 7f8052cd6620..000000000000
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ /dev/null
@@ -1,50 +0,0 @@
1/**
2 * @file nmi_timer_int.c
3 *
4 * @remark Copyright 2003 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author Zwane Mwaikambo <zwane@linuxpower.ca>
8 */
9
10#include <linux/init.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/oprofile.h>
14#include <linux/rcupdate.h>
15#include <linux/kdebug.h>
16
17#include <asm/nmi.h>
18#include <asm/apic.h>
19#include <asm/ptrace.h>
20
21static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs)
22{
23 oprofile_add_sample(regs, 0);
24 return NMI_HANDLED;
25}
26
27static int timer_start(void)
28{
29 if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify,
30 0, "oprofile-timer"))
31 return 1;
32 return 0;
33}
34
35
36static void timer_stop(void)
37{
38 unregister_nmi_handler(NMI_LOCAL, "oprofile-timer");
39 synchronize_sched(); /* Allow already-started NMIs to complete. */
40}
41
42
43int __init op_nmi_timer_init(struct oprofile_operations *ops)
44{
45 ops->start = timer_start;
46 ops->stop = timer_stop;
47 ops->cpu_type = "timer";
48 printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
49 return 0;
50}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..e76e18c94a3c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,11 +15,12 @@ obj-$(CONFIG_X86_VISWS) += visws.o
 
 obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
 
-obj-$(CONFIG_X86_MRST) += mrst.o
+obj-$(CONFIG_X86_INTEL_MID) += mrst.o
 
 obj-y += common.o early.o
-obj-y += amd_bus.o bus_numa.o
+obj-y += bus_numa.o
 
+obj-$(CONFIG_AMD_NB) += amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
 
 ifeq ($(CONFIG_PCI_DEBUG),y)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a3ff9e..a312e76063a7 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,7 @@ struct pci_root_info {
12 char *name; 12 char *name;
13 unsigned int res_num; 13 unsigned int res_num;
14 struct resource *res; 14 struct resource *res;
15 struct pci_bus *bus; 15 struct list_head *resources;
16 int busnum; 16 int busnum;
17}; 17};
18 18
@@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id)
24 return 0; 24 return 0;
25} 25}
26 26
27static int __init set_nouse_crs(const struct dmi_system_id *id)
28{
29 pci_use_crs = false;
30 return 0;
31}
32
27static const struct dmi_system_id pci_use_crs_table[] __initconst = { 33static const struct dmi_system_id pci_use_crs_table[] __initconst = {
28 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ 34 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
29 { 35 {
@@ -54,6 +60,29 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
54 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), 60 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
55 }, 61 },
56 }, 62 },
63
64 /* Now for the blacklist.. */
65
66 /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
67 {
68 .callback = set_nouse_crs,
69 .ident = "Dell Studio 1557",
70 .matches = {
71 DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
72 DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
73 DMI_MATCH(DMI_BIOS_VERSION, "A09"),
74 },
75 },
76 /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
77 {
78 .callback = set_nouse_crs,
79 .ident = "Thinkpad SL510",
80 .matches = {
81 DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
82 DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
83 DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
84 },
85 },
57 {} 86 {}
58}; 87};
59 88
@@ -149,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
149 struct acpi_resource_address64 addr; 178 struct acpi_resource_address64 addr;
150 acpi_status status; 179 acpi_status status;
151 unsigned long flags; 180 unsigned long flags;
152 u64 start, end; 181 u64 start, orig_end, end;
153 182
154 status = resource_to_addr(acpi_res, &addr); 183 status = resource_to_addr(acpi_res, &addr);
155 if (!ACPI_SUCCESS(status)) 184 if (!ACPI_SUCCESS(status))
@@ -165,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
165 return AE_OK; 194 return AE_OK;
166 195
167 start = addr.minimum + addr.translation_offset; 196 start = addr.minimum + addr.translation_offset;
168 end = addr.maximum + addr.translation_offset; 197 orig_end = end = addr.maximum + addr.translation_offset;
198
199 /* Exclude non-addressable range or non-addressable portion of range */
200 end = min(end, (u64)iomem_resource.end);
201 if (end <= start) {
202 dev_info(&info->bridge->dev,
203 "host bridge window [%#llx-%#llx] "
204 "(ignored, not CPU addressable)\n", start, orig_end);
205 return AE_OK;
206 } else if (orig_end != end) {
207 dev_info(&info->bridge->dev,
208 "host bridge window [%#llx-%#llx] "
209 "([%#llx-%#llx] ignored, not CPU addressable)\n",
210 start, orig_end, end + 1, orig_end);
211 }
169 212
170 res = &info->res[info->res_num]; 213 res = &info->res[info->res_num];
171 res->name = info->name; 214 res->name = info->name;
@@ -261,23 +304,20 @@ static void add_resources(struct pci_root_info *info)
261 "ignoring host bridge window %pR (conflicts with %s %pR)\n", 304 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
262 res, conflict->name, conflict); 305 res, conflict->name, conflict);
263 else 306 else
264 pci_bus_add_resource(info->bus, res, 0); 307 pci_add_resource(info->resources, res);
265 } 308 }
266} 309}
267 310
268static void 311static void
269get_current_resources(struct acpi_device *device, int busnum, 312get_current_resources(struct acpi_device *device, int busnum,
270 int domain, struct pci_bus *bus) 313 int domain, struct list_head *resources)
271{ 314{
272 struct pci_root_info info; 315 struct pci_root_info info;
273 size_t size; 316 size_t size;
274 317
275 if (pci_use_crs)
276 pci_bus_remove_resources(bus);
277
278 info.bridge = device; 318 info.bridge = device;
279 info.bus = bus;
280 info.res_num = 0; 319 info.res_num = 0;
320 info.resources = resources;
281 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 321 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
282 &info); 322 &info);
283 if (!info.res_num) 323 if (!info.res_num)
@@ -286,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum,
286 size = sizeof(*info.res) * info.res_num; 326 size = sizeof(*info.res) * info.res_num;
287 info.res = kmalloc(size, GFP_KERNEL); 327 info.res = kmalloc(size, GFP_KERNEL);
288 if (!info.res) 328 if (!info.res)
289 goto res_alloc_fail; 329 return;
290 330
291 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); 331 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
292 if (!info.name) 332 if (!info.name)
@@ -301,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum,
301 341
302name_alloc_fail: 342name_alloc_fail:
303 kfree(info.res); 343 kfree(info.res);
304res_alloc_fail:
305 return;
306} 344}
307 345
308struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) 346struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -310,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
310 struct acpi_device *device = root->device; 348 struct acpi_device *device = root->device;
311 int domain = root->segment; 349 int domain = root->segment;
312 int busnum = root->secondary.start; 350 int busnum = root->secondary.start;
351 LIST_HEAD(resources);
313 struct pci_bus *bus; 352 struct pci_bus *bus;
314 struct pci_sysdata *sd; 353 struct pci_sysdata *sd;
315 int node; 354 int node;
@@ -364,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
364 memcpy(bus->sysdata, sd, sizeof(*sd)); 403 memcpy(bus->sysdata, sd, sizeof(*sd));
365 kfree(sd); 404 kfree(sd);
366 } else { 405 } else {
367 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 406 get_current_resources(device, busnum, domain, &resources);
368 if (bus) { 407 if (list_empty(&resources))
369 get_current_resources(device, busnum, domain, bus); 408 x86_pci_root_bus_resources(busnum, &resources);
409 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
410 &resources);
411 if (bus)
370 bus->subordinate = pci_scan_child_bus(bus); 412 bus->subordinate = pci_scan_child_bus(bus);
371 } 413 else
414 pci_free_resource_list(&resources);
372 } 415 }
373 416
374 /* After the PCI-E bus has been walked and all devices discovered, 417 /* After the PCI-E bus has been walked and all devices discovered,
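The acpi.c hunks above convert pci_acpi_scan_root() from pci_create_bus() plus after-the-fact resource fix-ups to the list-based pci_create_root_bus() API: the _CRS windows are collected into a resource list first, and the list is only consumed if the root bus is actually created. A reduced sketch of the new flow (error paths and sysdata setup trimmed, not a drop-in replacement for the code above):

	LIST_HEAD(resources);
	struct pci_bus *bus;

	get_current_resources(device, busnum, domain, &resources);
	if (list_empty(&resources))
		x86_pci_root_bus_resources(busnum, &resources);	/* fall back to probed/default windows */

	bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
	if (bus)
		bus->subordinate = pci_scan_child_bus(bus);
	else
		pci_free_resource_list(&resources);	/* list ownership transfers only on success */
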
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d162..0567df3890e1 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
31}; 31};
32 32
33static u64 __initdata fam10h_mmconf_start;
34static u64 __initdata fam10h_mmconf_end;
35static void __init get_pci_mmcfg_amd_fam10h_range(void)
36{
37 u32 address;
38 u64 base, msr;
39 unsigned segn_busn_bits;
40
41 /* assume all cpus from fam10h have mmconf */
42 if (boot_cpu_data.x86 < 0x10)
43 return;
44
45 address = MSR_FAM10H_MMIO_CONF_BASE;
46 rdmsrl(address, msr);
47
48 /* mmconfig is not enable */
49 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
50 return;
51
52 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
53
54 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
55 FAM10H_MMIO_CONF_BUSRANGE_MASK;
56
57 fam10h_mmconf_start = base;
58 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
59}
60
61#define RANGE_NUM 16 33#define RANGE_NUM 16
62 34
63/** 35/**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
85 u64 val; 57 u64 val;
86 u32 address; 58 u32 address;
87 bool found; 59 bool found;
60 struct resource fam10h_mmconf_res, *fam10h_mmconf;
61 u64 fam10h_mmconf_start;
62 u64 fam10h_mmconf_end;
88 63
89 if (!early_pci_allowed()) 64 if (!early_pci_allowed())
90 return -1; 65 return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
211 subtract_range(range, RANGE_NUM, 0, end); 186 subtract_range(range, RANGE_NUM, 0, end);
212 187
213 /* get mmconfig */ 188 /* get mmconfig */
214 get_pci_mmcfg_amd_fam10h_range(); 189 fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
215 /* need to take out mmconf range */ 190 /* need to take out mmconf range */
216 if (fam10h_mmconf_end) { 191 if (fam10h_mmconf) {
217 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 192 printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
193 fam10h_mmconf_start = fam10h_mmconf->start;
194 fam10h_mmconf_end = fam10h_mmconf->end;
218 subtract_range(range, RANGE_NUM, fam10h_mmconf_start, 195 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
219 fam10h_mmconf_end + 1); 196 fam10h_mmconf_end + 1);
197 } else {
198 fam10h_mmconf_start = 0;
199 fam10h_mmconf_end = 0;
220 } 200 }
221 201
222 /* mmio resource */ 202 /* mmio resource */
@@ -403,7 +383,6 @@ static void __init pci_enable_pci_io_ecs(void)
403 ++n; 383 ++n;
404 } 384 }
405 } 385 }
406 pr_info("Extended Config Space enabled on %u nodes\n", n);
407#endif 386#endif
408} 387}
409 388
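In amd_bus.c the open-coded family-10h MSR parsing is replaced by the shared helper amd_get_mmconfig_range(), which returns the MMCONF window as a struct resource (or NULL when MMCONF is disabled), so the range can be printed with %pR and carved out of the candidate MMIO ranges. The essential pattern, as a sketch:

	struct resource fam10h_mmconf_res, *fam10h_mmconf;

	fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
	if (fam10h_mmconf) {
		printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
		subtract_range(range, RANGE_NUM, fam10h_mmconf->start,
			       fam10h_mmconf->end + 1);
	}
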
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index ab8269b0da29..f3a7c569a403 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -15,10 +15,11 @@
15#include <linux/pci.h> 15#include <linux/pci.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <asm/pci_x86.h> 17#include <asm/pci_x86.h>
18#include <asm/pci-direct.h>
18 19
19#include "bus_numa.h" 20#include "bus_numa.h"
20 21
21static void __devinit cnb20le_res(struct pci_dev *dev) 22static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
22{ 23{
23 struct pci_root_info *info; 24 struct pci_root_info *info;
24 struct resource res; 25 struct resource res;
@@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
26 u8 fbus, lbus; 27 u8 fbus, lbus;
27 int i; 28 int i;
28 29
29#ifdef CONFIG_ACPI
30 /*
31 * We should get host bridge information from ACPI unless the BIOS
32 * doesn't support it.
33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
37
38 info = &pci_root_info[pci_root_num]; 30 info = &pci_root_info[pci_root_num];
39 pci_root_num++; 31 pci_root_num++;
40 32
41 /* read the PCI bus numbers */ 33 /* read the PCI bus numbers */
42 pci_read_config_byte(dev, 0x44, &fbus); 34 fbus = read_pci_config_byte(bus, slot, func, 0x44);
43 pci_read_config_byte(dev, 0x45, &lbus); 35 lbus = read_pci_config_byte(bus, slot, func, 0x45);
44 info->bus_min = fbus; 36 info->bus_min = fbus;
45 info->bus_max = lbus; 37 info->bus_max = lbus;
46 38
@@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
59 } 51 }
60 52
61 /* read the non-prefetchable memory window */ 53 /* read the non-prefetchable memory window */
62 pci_read_config_word(dev, 0xc0, &word1); 54 word1 = read_pci_config_16(bus, slot, func, 0xc0);
63 pci_read_config_word(dev, 0xc2, &word2); 55 word2 = read_pci_config_16(bus, slot, func, 0xc2);
64 if (word1 != word2) { 56 if (word1 != word2) {
65 res.start = (word1 << 16) | 0x0000; 57 res.start = (word1 << 16) | 0x0000;
66 res.end = (word2 << 16) | 0xffff; 58 res.end = (word2 << 16) | 0xffff;
@@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
69 } 61 }
70 62
71 /* read the prefetchable memory window */ 63 /* read the prefetchable memory window */
72 pci_read_config_word(dev, 0xc4, &word1); 64 word1 = read_pci_config_16(bus, slot, func, 0xc4);
73 pci_read_config_word(dev, 0xc6, &word2); 65 word2 = read_pci_config_16(bus, slot, func, 0xc6);
74 if (word1 != word2) { 66 if (word1 != word2) {
75 res.start = (word1 << 16) | 0x0000; 67 res.start = (word1 << 16) | 0x0000;
76 res.end = (word2 << 16) | 0xffff; 68 res.end = (word2 << 16) | 0xffff;
@@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
79 } 71 }
80 72
81 /* read the IO port window */ 73 /* read the IO port window */
82 pci_read_config_word(dev, 0xd0, &word1); 74 word1 = read_pci_config_16(bus, slot, func, 0xd0);
83 pci_read_config_word(dev, 0xd2, &word2); 75 word2 = read_pci_config_16(bus, slot, func, 0xd2);
84 if (word1 != word2) { 76 if (word1 != word2) {
85 res.start = word1; 77 res.start = word1;
86 res.end = word2; 78 res.end = word2;
@@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
92 res.start = fbus; 84 res.start = fbus;
93 res.end = lbus; 85 res.end = lbus;
94 res.flags = IORESOURCE_BUS; 86 res.flags = IORESOURCE_BUS;
95 dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n", 87 printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
96 pci_domain_nr(dev->bus), &res);
97 88
98 for (i = 0; i < info->res_num; i++) 89 for (i = 0; i < info->res_num; i++)
99 dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]); 90 printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
100} 91}
101 92
102DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, 93static int __init broadcom_postcore_init(void)
103 cnb20le_res); 94{
95 u8 bus = 0, slot = 0;
96 u32 id;
97 u16 vendor, device;
98
99#ifdef CONFIG_ACPI
100 /*
101 * We should get host bridge information from ACPI unless the BIOS
102 * doesn't support it.
103 */
104 if (acpi_os_get_root_pointer())
105 return 0;
106#endif
107
108 id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
109 vendor = id & 0xffff;
110 device = (id >> 16) & 0xffff;
111
112 if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
113 device == PCI_DEVICE_ID_SERVERWORKS_LE) {
114 cnb20le_res(bus, slot, 0);
115 cnb20le_res(bus, slot, 1);
116 }
117 return 0;
118}
104 119
120postcore_initcall(broadcom_postcore_init);
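broadcom_bus.c drops the DECLARE_PCI_FIXUP_EARLY hook in favour of a postcore_initcall, presumably because a header fixup only runs once a struct pci_dev exists, which is now too late: the host-bridge windows have to be known before the root bus is created with its resource list. The bridge is therefore probed through the early config-space accessors from <asm/pci-direct.h>; a minimal sketch of that probe:

	/* sketch: probe bus 0, device 0 for the CNB20LE bridge */
	u32 id = read_pci_config(0, 0, 0, PCI_VENDOR_ID);	/* vendor in bits 15:0, device in bits 31:16 */

	if ((id & 0xffff) == PCI_VENDOR_ID_SERVERWORKS &&
	    (id >> 16) == PCI_DEVICE_ID_SERVERWORKS_LE) {
		cnb20le_res(0, 0, 0);
		cnb20le_res(0, 0, 1);
	}
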
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 64a122883896..fd3f65510e9d 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -7,45 +7,50 @@
7int pci_root_num; 7int pci_root_num;
8struct pci_root_info pci_root_info[PCI_ROOT_NR]; 8struct pci_root_info pci_root_info[PCI_ROOT_NR];
9 9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b) 10void x86_pci_root_bus_resources(int bus, struct list_head *resources)
11{ 11{
12 int i; 12 int i;
13 int j; 13 int j;
14 struct pci_root_info *info; 14 struct pci_root_info *info;
15 15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num) 16 if (!pci_root_num)
22 return; 17 goto default_resources;
23 18
24 for (i = 0; i < pci_root_num; i++) { 19 for (i = 0; i < pci_root_num; i++) {
25 if (pci_root_info[i].bus_min == b->number) 20 if (pci_root_info[i].bus_min == bus)
26 break; 21 break;
27 } 22 }
28 23
29 if (i == pci_root_num) 24 if (i == pci_root_num)
30 return; 25 goto default_resources;
31 26
32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", 27 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
33 b->number); 28 bus);
34 29
35 pci_bus_remove_resources(b);
36 info = &pci_root_info[i]; 30 info = &pci_root_info[i];
37 for (j = 0; j < info->res_num; j++) { 31 for (j = 0; j < info->res_num; j++) {
38 struct resource *res; 32 struct resource *res;
39 struct resource *root; 33 struct resource *root;
40 34
41 res = &info->res[j]; 35 res = &info->res[j];
42 pci_bus_add_resource(b, res, 0); 36 pci_add_resource(resources, res);
43 if (res->flags & IORESOURCE_IO) 37 if (res->flags & IORESOURCE_IO)
44 root = &ioport_resource; 38 root = &ioport_resource;
45 else 39 else
46 root = &iomem_resource; 40 root = &iomem_resource;
47 insert_resource(root, res); 41 insert_resource(root, res);
48 } 42 }
43 return;
44
45default_resources:
46 /*
47 * We don't have any host bridge aperture information from the
48 * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
49 * so fall back to the defaults historically used by pci_create_bus().
50 */
51 printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
52 pci_add_resource(resources, &ioport_resource);
53 pci_add_resource(resources, &iomem_resource);
49} 54}
50 55
51void __devinit update_res(struct pci_root_info *info, resource_size_t start, 56void __devinit update_res(struct pci_root_info *info, resource_size_t start,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7962ccb4d9b2..323481e06ef8 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
164{ 164{
165 struct pci_dev *dev; 165 struct pci_dev *dev;
166 166
167 /* root bus? */
168 if (!b->parent)
169 x86_pci_root_bus_res_quirks(b);
170 pci_read_bridge_bases(b); 167 pci_read_bridge_bases(b);
171 list_for_each_entry(dev, &b->devices, bus_list) 168 list_for_each_entry(dev, &b->devices, bus_list)
172 pcibios_fixup_device_resources(dev); 169 pcibios_fixup_device_resources(dev);
@@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void)
433 430
434struct pci_bus * __devinit pcibios_scan_root(int busnum) 431struct pci_bus * __devinit pcibios_scan_root(int busnum)
435{ 432{
433 LIST_HEAD(resources);
436 struct pci_bus *bus = NULL; 434 struct pci_bus *bus = NULL;
437 struct pci_sysdata *sd; 435 struct pci_sysdata *sd;
438 436
@@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
456 sd->node = get_mp_bus_to_node(busnum); 454 sd->node = get_mp_bus_to_node(busnum);
457 455
458 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); 456 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
459 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 457 x86_pci_root_bus_resources(busnum, &resources);
460 if (!bus) 458 bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
459 if (!bus) {
460 pci_free_resource_list(&resources);
461 kfree(sd); 461 kfree(sd);
462 }
462 463
463 return bus; 464 return bus;
464} 465}
@@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev)
639 640
640struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) 641struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
641{ 642{
643 LIST_HEAD(resources);
642 struct pci_bus *bus = NULL; 644 struct pci_bus *bus = NULL;
643 struct pci_sysdata *sd; 645 struct pci_sysdata *sd;
644 646
@@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
653 return NULL; 655 return NULL;
654 } 656 }
655 sd->node = node; 657 sd->node = node;
656 bus = pci_scan_bus(busno, ops, sd); 658 x86_pci_root_bus_resources(busno, &resources);
657 if (!bus) 659 bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
660 if (!bus) {
661 pci_free_resource_list(&resources);
658 kfree(sd); 662 kfree(sd);
663 }
659 664
660 return bus; 665 return bus;
661} 666}
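pcibios_scan_root() and pci_scan_bus_on_node() now share one pattern: build the root-bus resource list with x86_pci_root_bus_resources() (hardware-probed windows when amd_bus or broadcom_bus supplied them, otherwise the historical ioport_resource/iomem_resource defaults) and hand it to pci_scan_root_bus(). Minimal sketch, assuming sd has already been allocated:

	LIST_HEAD(resources);
	struct pci_bus *bus;

	x86_pci_root_bus_resources(busnum, &resources);
	bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
	if (!bus) {
		pci_free_resource_list(&resources);	/* only freed on failure; success hands the list to the bus */
		kfree(sd);
	}
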
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 794b092d01ae..91821a1a0c3a 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,26 +254,6 @@ void __init pcibios_resource_survey(void)
254 */ 254 */
255fs_initcall(pcibios_assign_resources); 255fs_initcall(pcibios_assign_resources);
256 256
257/*
258 * If we set up a device for bus mastering, we need to check the latency
259 * timer as certain crappy BIOSes forget to set it properly.
260 */
261unsigned int pcibios_max_latency = 255;
262
263void pcibios_set_master(struct pci_dev *dev)
264{
265 u8 lat;
266 pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
267 if (lat < 16)
268 lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
269 else if (lat > pcibios_max_latency)
270 lat = pcibios_max_latency;
271 else
272 return;
273 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
274 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
275}
276
277static const struct vm_operations_struct pci_mmap_ops = { 257static const struct vm_operations_struct pci_mmap_ops = {
278 .access = generic_access_phys, 258 .access = generic_access_phys,
279}; 259};
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 2c2aeabc2609..a1df191129d3 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -31,9 +31,6 @@ int __init pci_legacy_init(void)
31 31
32 printk("PCI: Probing PCI hardware\n"); 32 printk("PCI: Probing PCI hardware\n");
33 pci_root_bus = pcibios_scan_root(0); 33 pci_root_bus = pcibios_scan_root(0);
34 if (pci_root_bus)
35 pci_bus_add_devices(pci_root_bus);
36
37 return 0; 34 return 0;
38} 35}
39 36
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 51abf02f9226..83e125b95ca6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -153,8 +153,6 @@ int __init pci_numaq_init(void)
153 raw_pci_ops = &pci_direct_conf1_mq; 153 raw_pci_ops = &pci_direct_conf1_mq;
154 154
155 pci_root_bus = pcibios_scan_root(0); 155 pci_root_bus = pcibios_scan_root(0);
156 if (pci_root_bus)
157 pci_bus_add_devices(pci_root_bus);
158 if (num_online_nodes() > 1) 156 if (num_online_nodes() > 1)
159 for_each_online_node(quad) { 157 for_each_online_node(quad) {
160 if (quad == 0) 158 if (quad == 0)
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index db0e9a51e611..da8fe0535ff4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -44,7 +44,7 @@ static inline void set_bios_x(void)
44 pcibios_enabled = 1; 44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); 45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX) 46 if (__supported_pte_mask & _PAGE_NX)
47 printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); 47 printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48} 48}
49 49
50/* 50/*
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 4a01967f02e7..4cf9bd0a1653 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
238 238
239 spin_lock_irqsave(&rtc_lock, flags); 239 spin_lock_irqsave(&rtc_lock, flags);
240 efi_call_phys_prelog(); 240 efi_call_phys_prelog();
241 status = efi_call_phys2(efi_phys.get_time, tm, tc); 241 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
242 virt_to_phys(tc));
242 efi_call_phys_epilog(); 243 efi_call_phys_epilog();
243 spin_unlock_irqrestore(&rtc_lock, flags); 244 spin_unlock_irqrestore(&rtc_lock, flags);
244 return status; 245 return status;
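The efi.c hunk fixes phys_efi_get_time(): the runtime service is invoked through efi_call_phys2() with the firmware running on physical addresses (the call is bracketed by efi_call_phys_prelog()/epilog()), so the tm and tc pointers the firmware dereferences must be physical as well, hence the virt_to_phys() wrappers:

	status = efi_call_phys2(efi_phys.get_time,
				virt_to_phys(tm), virt_to_phys(tc));
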
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e36bf714cb77..40e446941dd7 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -39,43 +39,14 @@
39 */ 39 */
40 40
41static unsigned long efi_rt_eflags; 41static unsigned long efi_rt_eflags;
42static pgd_t efi_bak_pg_dir_pointer[2];
43 42
44void efi_call_phys_prelog(void) 43void efi_call_phys_prelog(void)
45{ 44{
46 unsigned long cr4;
47 unsigned long temp;
48 struct desc_ptr gdt_descr; 45 struct desc_ptr gdt_descr;
49 46
50 local_irq_save(efi_rt_eflags); 47 local_irq_save(efi_rt_eflags);
51 48
52 /* 49 load_cr3(initial_page_table);
53 * If I don't have PAE, I should just duplicate two entries in page
54 * directory. If I have PAE, I just need to duplicate one entry in
55 * page directory.
56 */
57 cr4 = read_cr4_safe();
58
59 if (cr4 & X86_CR4_PAE) {
60 efi_bak_pg_dir_pointer[0].pgd =
61 swapper_pg_dir[pgd_index(0)].pgd;
62 swapper_pg_dir[0].pgd =
63 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
64 } else {
65 efi_bak_pg_dir_pointer[0].pgd =
66 swapper_pg_dir[pgd_index(0)].pgd;
67 efi_bak_pg_dir_pointer[1].pgd =
68 swapper_pg_dir[pgd_index(0x400000)].pgd;
69 swapper_pg_dir[pgd_index(0)].pgd =
70 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
71 temp = PAGE_OFFSET + 0x400000;
72 swapper_pg_dir[pgd_index(0x400000)].pgd =
73 swapper_pg_dir[pgd_index(temp)].pgd;
74 }
75
76 /*
77 * After the lock is released, the original page table is restored.
78 */
79 __flush_tlb_all(); 50 __flush_tlb_all();
80 51
81 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 52 gdt_descr.address = __pa(get_cpu_gdt_table(0));
@@ -85,28 +56,13 @@ void efi_call_phys_prelog(void)
85 56
86void efi_call_phys_epilog(void) 57void efi_call_phys_epilog(void)
87{ 58{
88 unsigned long cr4;
89 struct desc_ptr gdt_descr; 59 struct desc_ptr gdt_descr;
90 60
91 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); 61 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
92 gdt_descr.size = GDT_SIZE - 1; 62 gdt_descr.size = GDT_SIZE - 1;
93 load_gdt(&gdt_descr); 63 load_gdt(&gdt_descr);
94 64
95 cr4 = read_cr4_safe(); 65 load_cr3(swapper_pg_dir);
96
97 if (cr4 & X86_CR4_PAE) {
98 swapper_pg_dir[pgd_index(0)].pgd =
99 efi_bak_pg_dir_pointer[0].pgd;
100 } else {
101 swapper_pg_dir[pgd_index(0)].pgd =
102 efi_bak_pg_dir_pointer[0].pgd;
103 swapper_pg_dir[pgd_index(0x400000)].pgd =
104 efi_bak_pg_dir_pointer[1].pgd;
105 }
106
107 /*
108 * After the lock is released, the original page table is restored.
109 */
110 __flush_tlb_all(); 66 __flush_tlb_all();
111 67
112 local_irq_restore(efi_rt_eflags); 68 local_irq_restore(efi_rt_eflags);
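On 32-bit, efi_call_phys_prelog()/epilog() no longer hand-patch swapper_pg_dir entries for the PAE and non-PAE cases; they simply switch to initial_page_table, which already carries the low 1:1 mapping a physical-mode firmware call needs, and switch back afterwards. The core of the new sequence, stripped of the GDT handling shown above:

	/* enter the physical-mode call window */
	load_cr3(initial_page_table);
	__flush_tlb_all();
	/* ... load the physical GDT and make the EFI call ... */
	/* leave the physical-mode call window */
	load_cr3(swapper_pg_dir);
	__flush_tlb_all();
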
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca1973699d3d..dc5f1d32aced 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
27 27
28#include <asm/geode.h> 28#include <asm/geode.h>
29 29
30static int force = 0; 30static bool force = 0;
31module_param(force, bool, 0444); 31module_param(force, bool, 0444);
32/* FIXME: Award bios is not automatically detected as Alix platform */ 32/* FIXME: Award bios is not automatically detected as Alix platform */
33MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); 33MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5ed8c9b..5917eb56b313 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); 42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); 43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
44 44
45static int force; 45static bool force;
46 46
47module_param(force, bool, 0); 47module_param(force, bool, 0);
48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); 48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
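The alix.c and iris.c hunks convert the "force" parameters from int to bool; this merge window tightened the type checking for bool module parameters, so a bool module_param now has to be backed by a real bool variable. The resulting idiom, mirroring alix.c:

	static bool force;
	module_param(force, bool, 0444);
	MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
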
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea38775a6d3..7baed5135e0f 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_X86_MRST) += mrst.o 1obj-$(CONFIG_X86_INTEL_MID) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o 2obj-$(CONFIG_X86_INTEL_MID) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o 3obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o
4obj-$(CONFIG_X86_MRST) += pmu.o 4obj-$(CONFIG_X86_MRST) += pmu.o
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 25bfdbb5b130..3c6e328483c7 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -245,16 +245,24 @@ struct console early_mrst_console = {
245 * Following is the early console based on Medfield HSU (High 245 * Following is the early console based on Medfield HSU (High
246 * Speed UART) device. 246 * Speed UART) device.
247 */ 247 */
248#define HSU_PORT2_PADDR 0xffa28180 248#define HSU_PORT_BASE 0xffa28080
249 249
250static void __iomem *phsu; 250static void __iomem *phsu;
251 251
252void hsu_early_console_init(void) 252void hsu_early_console_init(const char *s)
253{ 253{
254 unsigned long paddr, port = 0;
254 u8 lcr; 255 u8 lcr;
255 256
256 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, 257 /*
257 HSU_PORT2_PADDR); 258 * Select the early HSU console port if specified by user in the
259 * kernel command line.
260 */
261 if (*s && !kstrtoul(s, 10, &port))
262 port = clamp_val(port, 0, 2);
263
264 paddr = HSU_PORT_BASE + port * 0x80;
265 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
258 266
259 /* Disable FIFO */ 267 /* Disable FIFO */
260 writeb(0x0, phsu + UART_FCR); 268 writeb(0x0, phsu + UART_FCR);
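hsu_early_console_init() now takes the option string so the early HSU console port can be chosen on the kernel command line: the digits are parsed with kstrtoul(), clamped to the range 0-2, and the fixmap is pointed at HSU_PORT_BASE plus a 0x80 stride per port. The old hard-coded HSU_PORT2_PADDR is simply the port-2 case of the new formula:

	/* port 2: 0xffa28080 + 2 * 0x80 = 0xffa28180, the former HSU_PORT2_PADDR */
	paddr = HSU_PORT_BASE + port * 0x80;
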
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index b1489a06a49d..475e2cd0f3c3 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -76,6 +76,20 @@ struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
76EXPORT_SYMBOL_GPL(sfi_mrtc_array); 76EXPORT_SYMBOL_GPL(sfi_mrtc_array);
77int sfi_mrtc_num; 77int sfi_mrtc_num;
78 78
79static void mrst_power_off(void)
80{
81 if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
82 intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 1);
83}
84
85static void mrst_reboot(void)
86{
87 if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
88 intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
89 else
90 intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
91}
92
79/* parse all the mtimer info to a static mtimer array */ 93/* parse all the mtimer info to a static mtimer array */
80static int __init sfi_parse_mtmr(struct sfi_table_header *table) 94static int __init sfi_parse_mtmr(struct sfi_table_header *table)
81{ 95{
@@ -265,17 +279,6 @@ static int mrst_i8042_detect(void)
265 return 0; 279 return 0;
266} 280}
267 281
268/* Reboot and power off are handled by the SCU on a MID device */
269static void mrst_power_off(void)
270{
271 intel_scu_ipc_simple_command(0xf1, 1);
272}
273
274static void mrst_reboot(void)
275{
276 intel_scu_ipc_simple_command(0xf1, 0);
277}
278
279/* 282/*
280 * Moorestown does not have external NMI source nor port 0x61 to report 283 * Moorestown does not have external NMI source nor port 0x61 to report
281 * NMI status. The possible NMI sources are from pmu as a result of NMI 284 * NMI status. The possible NMI sources are from pmu as a result of NMI
@@ -484,6 +487,46 @@ static void __init *max7315_platform_data(void *info)
484 return max7315; 487 return max7315;
485} 488}
486 489
490static void *tca6416_platform_data(void *info)
491{
492 static struct pca953x_platform_data tca6416;
493 struct i2c_board_info *i2c_info = info;
494 int gpio_base, intr;
495 char base_pin_name[SFI_NAME_LEN + 1];
496 char intr_pin_name[SFI_NAME_LEN + 1];
497
498 strcpy(i2c_info->type, "tca6416");
499 strcpy(base_pin_name, "tca6416_base");
500 strcpy(intr_pin_name, "tca6416_int");
501
502 gpio_base = get_gpio_by_name(base_pin_name);
503 intr = get_gpio_by_name(intr_pin_name);
504
505 if (gpio_base == -1)
506 return NULL;
507 tca6416.gpio_base = gpio_base;
508 if (intr != -1) {
509 i2c_info->irq = intr + MRST_IRQ_OFFSET;
510 tca6416.irq_base = gpio_base + MRST_IRQ_OFFSET;
511 } else {
512 i2c_info->irq = -1;
513 tca6416.irq_base = -1;
514 }
515 return &tca6416;
516}
517
518static void *mpu3050_platform_data(void *info)
519{
520 struct i2c_board_info *i2c_info = info;
521 int intr = get_gpio_by_name("mpu3050_int");
522
523 if (intr == -1)
524 return NULL;
525
526 i2c_info->irq = intr + MRST_IRQ_OFFSET;
527 return NULL;
528}
529
487static void __init *emc1403_platform_data(void *info) 530static void __init *emc1403_platform_data(void *info)
488{ 531{
489 static short intr2nd_pdata; 532 static short intr2nd_pdata;
@@ -646,12 +689,15 @@ static void *msic_ocd_platform_data(void *info)
646static const struct devs_id __initconst device_ids[] = { 689static const struct devs_id __initconst device_ids[] = {
647 {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, 690 {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
648 {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, 691 {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
692 {"pmic_gpio", SFI_DEV_TYPE_IPC, 1, &pmic_gpio_platform_data},
649 {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, 693 {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
650 {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, 694 {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
651 {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, 695 {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
696 {"tca6416", SFI_DEV_TYPE_I2C, 1, &tca6416_platform_data},
652 {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, 697 {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
653 {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, 698 {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
654 {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, 699 {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
700 {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data},
655 701
656 /* MSIC subdevices */ 702 /* MSIC subdevices */
657 {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, 703 {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data},
@@ -802,8 +848,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
802 if (mrst_has_msic()) 848 if (mrst_has_msic())
803 return; 849 return;
804 850
805 /* ID as IRQ is a hack that will go away */ 851 pdev = platform_device_alloc(entry->name, 0);
806 pdev = platform_device_alloc(entry->name, entry->irq);
807 if (pdev == NULL) { 852 if (pdev == NULL) {
808 pr_err("out of memory for SFI platform device '%s'.\n", 853 pr_err("out of memory for SFI platform device '%s'.\n",
809 entry->name); 854 entry->name);
@@ -984,6 +1029,7 @@ static int __init pb_keys_init(void)
984 num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); 1029 num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
985 for (i = 0; i < num; i++) { 1030 for (i = 0; i < num; i++) {
986 gb[i].gpio = get_gpio_by_name(gb[i].desc); 1031 gb[i].gpio = get_gpio_by_name(gb[i].desc);
1032 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
987 if (gb[i].gpio == -1) 1033 if (gb[i].gpio == -1)
988 continue; 1034 continue;
989 1035
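The mrst.c additions extend the SFI device table with tca6416 and mpu3050 entries whose callbacks resolve named GPIOs into interrupt numbers, move the SCU power-off/reboot handlers earlier (now keyed off __mrst_cpu_chip and the IPCMSG_COLD_RESET/IPCMSG_COLD_BOOT commands rather than a raw 0xf1), and allocate SFI IPC platform devices with id 0 instead of reusing the IRQ as the device id (the removed comment called that a hack). The table/callback pattern looks like the following sketch; "example" and "example_int" are placeholders for illustration, not real SFI entries:

	static void *example_platform_data(void *info)
	{
		struct i2c_board_info *i2c_info = info;
		int intr = get_gpio_by_name("example_int");	/* name comes from the SFI GPIO table */

		if (intr == -1)
			return NULL;
		i2c_info->irq = intr + MRST_IRQ_OFFSET;
		return NULL;		/* only the IRQ is needed, no platform_data */
	}

	static const struct devs_id __initconst device_ids[] = {
		{"example", SFI_DEV_TYPE_I2C, 1, &example_platform_data},
		/* ... */
	};
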
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 5b552198f774..9be4cff00a2d 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub)
157 * clear of the Timeout bit (as well) will free the resource. No reply will 157 * clear of the Timeout bit (as well) will free the resource. No reply will
158 * be sent (the hardware will only do one reply per message). 158 * be sent (the hardware will only do one reply per message).
159 */ 159 */
160static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) 160static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
161 int do_acknowledge)
161{ 162{
162 unsigned long dw; 163 unsigned long dw;
163 struct bau_pq_entry *msg; 164 struct bau_pq_entry *msg;
164 165
165 msg = mdp->msg; 166 msg = mdp->msg;
166 if (!msg->canceled) { 167 if (!msg->canceled && do_acknowledge) {
167 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; 168 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
168 write_mmr_sw_ack(dw); 169 write_mmr_sw_ack(dw);
169 } 170 }
@@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
212 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { 213 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
213 unsigned long mr; 214 unsigned long mr;
214 /* 215 /*
215 * is the resource timed out? 216 * Is the resource timed out?
216 * make everyone ignore the cancelled message. 217 * Make everyone ignore the cancelled message.
217 */ 218 */
218 msg2->canceled = 1; 219 msg2->canceled = 1;
219 stat->d_canceled++; 220 stat->d_canceled++;
@@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
231 * Do all the things a cpu should do for a TLB shootdown message. 232 * Do all the things a cpu should do for a TLB shootdown message.
232 * Other cpu's may come here at the same time for this message. 233 * Other cpu's may come here at the same time for this message.
233 */ 234 */
234static void bau_process_message(struct msg_desc *mdp, 235static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
235 struct bau_control *bcp) 236 int do_acknowledge)
236{ 237{
237 short socket_ack_count = 0; 238 short socket_ack_count = 0;
238 short *sp; 239 short *sp;
@@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp,
284 if (msg_ack_count == bcp->cpus_in_uvhub) { 285 if (msg_ack_count == bcp->cpus_in_uvhub) {
285 /* 286 /*
286 * All cpus in uvhub saw it; reply 287 * All cpus in uvhub saw it; reply
288 * (unless we are in the UV2 workaround)
287 */ 289 */
288 reply_to_message(mdp, bcp); 290 reply_to_message(mdp, bcp, do_acknowledge);
289 } 291 }
290 } 292 }
291 293
@@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
491/* 493/*
492 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 494 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
493 */ 495 */
494static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) 496static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
495{ 497{
496 unsigned long descriptor_status; 498 unsigned long descriptor_status;
497 unsigned long descriptor_status2; 499 unsigned long descriptor_status2;
498 500
499 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); 501 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
500 descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; 502 descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
501 descriptor_status = (descriptor_status << 1) | descriptor_status2; 503 descriptor_status = (descriptor_status << 1) | descriptor_status2;
502 return descriptor_status; 504 return descriptor_status;
503} 505}
504 506
507/*
508 * Return whether the status of the descriptor that is normally used for this
509 * cpu (the one indexed by its hub-relative cpu number) is busy.
510 * The status of the original 32 descriptors is always reflected in the 64
511 * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
512 * The bit provided by the activation_status_2 register is irrelevant to
513 * the status if it is only being tested for busy or not busy.
514 */
515int normal_busy(struct bau_control *bcp)
516{
517 int cpu = bcp->uvhub_cpu;
518 int mmr_offset;
519 int right_shift;
520
521 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
522 right_shift = cpu * UV_ACT_STATUS_SIZE;
523 return (((((read_lmmr(mmr_offset) >> right_shift) &
524 UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
525}
526
527/*
528 * Entered when a bau descriptor has gone into a permanent busy wait because
529 * of a hardware bug.
530 * Workaround the bug.
531 */
532int handle_uv2_busy(struct bau_control *bcp)
533{
534 int busy_one = bcp->using_desc;
535 int normal = bcp->uvhub_cpu;
536 int selected = -1;
537 int i;
538 unsigned long descriptor_status;
539 unsigned long status;
540 int mmr_offset;
541 struct bau_desc *bau_desc_old;
542 struct bau_desc *bau_desc_new;
543 struct bau_control *hmaster = bcp->uvhub_master;
544 struct ptc_stats *stat = bcp->statp;
545 cycles_t ttm;
546
547 stat->s_uv2_wars++;
548 spin_lock(&hmaster->uvhub_lock);
549 /* try for the original first */
550 if (busy_one != normal) {
551 if (!normal_busy(bcp))
552 selected = normal;
553 }
554 if (selected < 0) {
555 /* can't use the normal, select an alternate */
556 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
557 descriptor_status = read_lmmr(mmr_offset);
558
559 /* scan available descriptors 32-63 */
560 for (i = 0; i < UV_CPUS_PER_AS; i++) {
561 if ((hmaster->inuse_map & (1 << i)) == 0) {
562 status = ((descriptor_status >>
563 (i * UV_ACT_STATUS_SIZE)) &
564 UV_ACT_STATUS_MASK) << 1;
565 if (status != UV2H_DESC_BUSY) {
566 selected = i + UV_CPUS_PER_AS;
567 break;
568 }
569 }
570 }
571 }
572
573 if (busy_one != normal)
574 /* mark the busy alternate as not in-use */
575 hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
576
577 if (selected >= 0) {
578 /* switch to the selected descriptor */
579 if (selected != normal) {
580 /* set the selected alternate as in-use */
581 hmaster->inuse_map |=
582 (1 << (selected - UV_CPUS_PER_AS));
583 if (selected > stat->s_uv2_wars_hw)
584 stat->s_uv2_wars_hw = selected;
585 }
586 bau_desc_old = bcp->descriptor_base;
587 bau_desc_old += (ITEMS_PER_DESC * busy_one);
588 bcp->using_desc = selected;
589 bau_desc_new = bcp->descriptor_base;
590 bau_desc_new += (ITEMS_PER_DESC * selected);
591 *bau_desc_new = *bau_desc_old;
592 } else {
593 /*
594 * All are busy. Wait for the normal one for this cpu to
595 * free up.
596 */
597 stat->s_uv2_war_waits++;
598 spin_unlock(&hmaster->uvhub_lock);
599 ttm = get_cycles();
600 do {
601 cpu_relax();
602 } while (normal_busy(bcp));
603 spin_lock(&hmaster->uvhub_lock);
604 /* switch to the original descriptor */
605 bcp->using_desc = normal;
606 bau_desc_old = bcp->descriptor_base;
607 bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
608 bcp->using_desc = (ITEMS_PER_DESC * normal);
609 bau_desc_new = bcp->descriptor_base;
610 bau_desc_new += (ITEMS_PER_DESC * normal);
611 *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
612 }
613 spin_unlock(&hmaster->uvhub_lock);
614 return FLUSH_RETRY_BUSYBUG;
615}
616
505static int uv2_wait_completion(struct bau_desc *bau_desc, 617static int uv2_wait_completion(struct bau_desc *bau_desc,
506 unsigned long mmr_offset, int right_shift, 618 unsigned long mmr_offset, int right_shift,
507 struct bau_control *bcp, long try) 619 struct bau_control *bcp, long try)
508{ 620{
509 unsigned long descriptor_stat; 621 unsigned long descriptor_stat;
510 cycles_t ttm; 622 cycles_t ttm;
511 int cpu = bcp->uvhub_cpu; 623 int desc = bcp->using_desc;
624 long busy_reps = 0;
512 struct ptc_stats *stat = bcp->statp; 625 struct ptc_stats *stat = bcp->statp;
513 626
514 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); 627 descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
515 628
516 /* spin on the status MMR, waiting for it to go idle */ 629 /* spin on the status MMR, waiting for it to go idle */
517 while (descriptor_stat != UV2H_DESC_IDLE) { 630 while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -522,32 +635,35 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
522 * our message and its state will stay IDLE. 635 * our message and its state will stay IDLE.
523 */ 636 */
524 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || 637 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
525 (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
526 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { 638 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
527 stat->s_stimeout++; 639 stat->s_stimeout++;
528 return FLUSH_GIVEUP; 640 return FLUSH_GIVEUP;
641 } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
642 stat->s_strongnacks++;
643 bcp->conseccompletes = 0;
644 return FLUSH_GIVEUP;
529 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { 645 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
530 stat->s_dtimeout++; 646 stat->s_dtimeout++;
531 ttm = get_cycles();
532 /*
533 * Our retries may be blocked by all destination
534 * swack resources being consumed, and a timeout
535 * pending. In that case hardware returns the
536 * ERROR that looks like a destination timeout.
537 */
538 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
539 bcp->conseccompletes = 0;
540 return FLUSH_RETRY_PLUGGED;
541 }
542 bcp->conseccompletes = 0; 647 bcp->conseccompletes = 0;
543 return FLUSH_RETRY_TIMEOUT; 648 return FLUSH_RETRY_TIMEOUT;
544 } else { 649 } else {
650 busy_reps++;
651 if (busy_reps > 1000000) {
652 /* not to hammer on the clock */
653 busy_reps = 0;
654 ttm = get_cycles();
655 if ((ttm - bcp->send_message) >
656 (bcp->clocks_per_100_usec)) {
657 return handle_uv2_busy(bcp);
658 }
659 }
545 /* 660 /*
546 * descriptor_stat is still BUSY 661 * descriptor_stat is still BUSY
547 */ 662 */
548 cpu_relax(); 663 cpu_relax();
549 } 664 }
550 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); 665 descriptor_stat = uv2_read_status(mmr_offset, right_shift,
666 desc);
551 } 667 }
552 bcp->conseccompletes++; 668 bcp->conseccompletes++;
553 return FLUSH_COMPLETE; 669 return FLUSH_COMPLETE;
@@ -563,17 +679,17 @@ static int wait_completion(struct bau_desc *bau_desc,
563{ 679{
564 int right_shift; 680 int right_shift;
565 unsigned long mmr_offset; 681 unsigned long mmr_offset;
566 int cpu = bcp->uvhub_cpu; 682 int desc = bcp->using_desc;
567 683
568 if (cpu < UV_CPUS_PER_AS) { 684 if (desc < UV_CPUS_PER_AS) {
569 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 685 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
570 right_shift = cpu * UV_ACT_STATUS_SIZE; 686 right_shift = desc * UV_ACT_STATUS_SIZE;
571 } else { 687 } else {
572 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; 688 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
573 right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); 689 right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
574 } 690 }
575 691
576 if (is_uv1_hub()) 692 if (bcp->uvhub_version == 1)
577 return uv1_wait_completion(bau_desc, mmr_offset, right_shift, 693 return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
578 bcp, try); 694 bcp, try);
579 else 695 else
@@ -752,19 +868,22 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
752 * Returns 1 if it gives up entirely and the original cpu mask is to be 868 * Returns 1 if it gives up entirely and the original cpu mask is to be
753 * returned to the kernel. 869 * returned to the kernel.
754 */ 870 */
755int uv_flush_send_and_wait(struct bau_desc *bau_desc, 871int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
756 struct cpumask *flush_mask, struct bau_control *bcp)
757{ 872{
758 int seq_number = 0; 873 int seq_number = 0;
759 int completion_stat = 0; 874 int completion_stat = 0;
875 int uv1 = 0;
760 long try = 0; 876 long try = 0;
761 unsigned long index; 877 unsigned long index;
762 cycles_t time1; 878 cycles_t time1;
763 cycles_t time2; 879 cycles_t time2;
764 struct ptc_stats *stat = bcp->statp; 880 struct ptc_stats *stat = bcp->statp;
765 struct bau_control *hmaster = bcp->uvhub_master; 881 struct bau_control *hmaster = bcp->uvhub_master;
882 struct uv1_bau_msg_header *uv1_hdr = NULL;
883 struct uv2_bau_msg_header *uv2_hdr = NULL;
884 struct bau_desc *bau_desc;
766 885
767 if (is_uv1_hub()) 886 if (bcp->uvhub_version == 1)
768 uv1_throttle(hmaster, stat); 887 uv1_throttle(hmaster, stat);
769 888
770 while (hmaster->uvhub_quiesce) 889 while (hmaster->uvhub_quiesce)
@@ -772,22 +891,39 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
772 891
773 time1 = get_cycles(); 892 time1 = get_cycles();
774 do { 893 do {
775 if (try == 0) { 894 bau_desc = bcp->descriptor_base;
776 bau_desc->header.msg_type = MSG_REGULAR; 895 bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
896 if (bcp->uvhub_version == 1) {
897 uv1 = 1;
898 uv1_hdr = &bau_desc->header.uv1_hdr;
899 } else
900 uv2_hdr = &bau_desc->header.uv2_hdr;
901 if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
902 if (uv1)
903 uv1_hdr->msg_type = MSG_REGULAR;
904 else
905 uv2_hdr->msg_type = MSG_REGULAR;
777 seq_number = bcp->message_number++; 906 seq_number = bcp->message_number++;
778 } else { 907 } else {
779 bau_desc->header.msg_type = MSG_RETRY; 908 if (uv1)
909 uv1_hdr->msg_type = MSG_RETRY;
910 else
911 uv2_hdr->msg_type = MSG_RETRY;
780 stat->s_retry_messages++; 912 stat->s_retry_messages++;
781 } 913 }
782 914
783 bau_desc->header.sequence = seq_number; 915 if (uv1)
784 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; 916 uv1_hdr->sequence = seq_number;
917 else
918 uv2_hdr->sequence = seq_number;
919 index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
785 bcp->send_message = get_cycles(); 920 bcp->send_message = get_cycles();
786 921
787 write_mmr_activation(index); 922 write_mmr_activation(index);
788 923
789 try++; 924 try++;
790 completion_stat = wait_completion(bau_desc, bcp, try); 925 completion_stat = wait_completion(bau_desc, bcp, try);
926 /* UV2: wait_completion() may change the bcp->using_desc */
791 927
792 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); 928 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
793 929
@@ -798,6 +934,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
798 } 934 }
799 cpu_relax(); 935 cpu_relax();
800 } while ((completion_stat == FLUSH_RETRY_PLUGGED) || 936 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
937 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
801 (completion_stat == FLUSH_RETRY_TIMEOUT)); 938 (completion_stat == FLUSH_RETRY_TIMEOUT));
802 939
803 time2 = get_cycles(); 940 time2 = get_cycles();
@@ -812,6 +949,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
812 record_send_stats(time1, time2, bcp, stat, completion_stat, try); 949 record_send_stats(time1, time2, bcp, stat, completion_stat, try);
813 950
814 if (completion_stat == FLUSH_GIVEUP) 951 if (completion_stat == FLUSH_GIVEUP)
952 /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
815 return 1; 953 return 1;
816 return 0; 954 return 0;
817} 955}
@@ -967,7 +1105,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
967 stat->s_ntargself++; 1105 stat->s_ntargself++;
968 1106
969 bau_desc = bcp->descriptor_base; 1107 bau_desc = bcp->descriptor_base;
970 bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu; 1108 bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
971 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 1109 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
972 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) 1110 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
973 return NULL; 1111 return NULL;
@@ -980,13 +1118,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
980 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1118 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
981 * or 1 if it gave up and the original cpumask should be returned. 1119 * or 1 if it gave up and the original cpumask should be returned.
982 */ 1120 */
983 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) 1121 if (!uv_flush_send_and_wait(flush_mask, bcp))
984 return NULL; 1122 return NULL;
985 else 1123 else
986 return cpumask; 1124 return cpumask;
987} 1125}
988 1126
989/* 1127/*
1128 * Search the message queue for any 'other' message with the same software
1129 * acknowledge resource bit vector.
1130 */
1131struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
1132 struct bau_control *bcp, unsigned char swack_vec)
1133{
1134 struct bau_pq_entry *msg_next = msg + 1;
1135
1136 if (msg_next > bcp->queue_last)
1137 msg_next = bcp->queue_first;
1138 while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
1139 if (msg_next->swack_vec == swack_vec)
1140 return msg_next;
1141 msg_next++;
1142 if (msg_next > bcp->queue_last)
1143 msg_next = bcp->queue_first;
1144 }
1145 return NULL;
1146}
1147
1148/*
1149 * UV2 needs to work around a bug in which an arriving message has not
1150 * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
1151 * Such a message must be ignored.
1152 */
1153void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
1154{
1155 unsigned long mmr_image;
1156 unsigned char swack_vec;
1157 struct bau_pq_entry *msg = mdp->msg;
1158 struct bau_pq_entry *other_msg;
1159
1160 mmr_image = read_mmr_sw_ack();
1161 swack_vec = msg->swack_vec;
1162
1163 if ((swack_vec & mmr_image) == 0) {
1164 /*
1165 * This message was assigned a swack resource, but no
1166 * reserved acknowlegment is pending.
1167 * The bug has prevented this message from setting the MMR.
1168 * And no other message has used the same sw_ack resource.
1169 * Do the requested shootdown but do not reply to the msg.
1170 * (the 0 means make no acknowledge)
1171 */
1172 bau_process_message(mdp, bcp, 0);
1173 return;
1174 }
1175
1176 /*
1177 * Some message has set the MMR 'pending' bit; it might have been
1178 * another message. Look for that message.
1179 */
1180 other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
1181 if (other_msg) {
1182 /* There is another. Do not ack the current one. */
1183 bau_process_message(mdp, bcp, 0);
1184 /*
1185 * Let the natural processing of that message acknowledge
1186 * it. Don't get the processing of sw_ack's out of order.
1187 */
1188 return;
1189 }
1190
1191 /*
1192 * There is no other message using this sw_ack, so it is safe to
1193 * acknowledge it.
1194 */
1195 bau_process_message(mdp, bcp, 1);
1196
1197 return;
1198}
1199
1200/*
990 * The BAU message interrupt comes here. (registered by set_intr_gate) 1201 * The BAU message interrupt comes here. (registered by set_intr_gate)
991 * See entry_64.S 1202 * See entry_64.S
992 * 1203 *
@@ -1009,6 +1220,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
1009 struct ptc_stats *stat; 1220 struct ptc_stats *stat;
1010 struct msg_desc msgdesc; 1221 struct msg_desc msgdesc;
1011 1222
1223 ack_APIC_irq();
1012 time_start = get_cycles(); 1224 time_start = get_cycles();
1013 1225
1014 bcp = &per_cpu(bau_control, smp_processor_id()); 1226 bcp = &per_cpu(bau_control, smp_processor_id());
@@ -1022,9 +1234,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
1022 count++; 1234 count++;
1023 1235
1024 msgdesc.msg_slot = msg - msgdesc.queue_first; 1236 msgdesc.msg_slot = msg - msgdesc.queue_first;
1025 msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
1026 msgdesc.msg = msg; 1237 msgdesc.msg = msg;
1027 bau_process_message(&msgdesc, bcp); 1238 if (bcp->uvhub_version == 2)
1239 process_uv2_message(&msgdesc, bcp);
1240 else
1241 bau_process_message(&msgdesc, bcp, 1);
1028 1242
1029 msg++; 1243 msg++;
1030 if (msg > msgdesc.queue_last) 1244 if (msg > msgdesc.queue_last)
@@ -1036,8 +1250,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
1036 stat->d_nomsg++; 1250 stat->d_nomsg++;
1037 else if (count > 1) 1251 else if (count > 1)
1038 stat->d_multmsg++; 1252 stat->d_multmsg++;
1039
1040 ack_APIC_irq();
1041} 1253}
1042 1254
1043/* 1255/*
@@ -1083,7 +1295,7 @@ static void __init enable_timeouts(void)
1083 */ 1295 */
1084 mmr_image |= (1L << SOFTACK_MSHIFT); 1296 mmr_image |= (1L << SOFTACK_MSHIFT);
1085 if (is_uv2_hub()) { 1297 if (is_uv2_hub()) {
1086 mmr_image |= (1L << UV2_LEG_SHFT); 1298 mmr_image &= ~(1L << UV2_LEG_SHFT);
1087 mmr_image |= (1L << UV2_EXT_SHFT); 1299 mmr_image |= (1L << UV2_EXT_SHFT);
1088 } 1300 }
1089 write_mmr_misc_control(pnode, mmr_image); 1301 write_mmr_misc_control(pnode, mmr_image);
@@ -1136,13 +1348,13 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1136 seq_printf(file, 1348 seq_printf(file,
1137 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1349 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1138 seq_printf(file, 1350 seq_printf(file,
1139 "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); 1351 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
1140 seq_printf(file, 1352 seq_printf(file,
1141 "resetp resett giveup sto bz throt swack recv rtime "); 1353 "resetp resett giveup sto bz throt swack recv rtime ");
1142 seq_printf(file, 1354 seq_printf(file,
1143 "all one mult none retry canc nocan reset rcan "); 1355 "all one mult none retry canc nocan reset rcan ");
1144 seq_printf(file, 1356 seq_printf(file,
1145 "disable enable\n"); 1357 "disable enable wars warshw warwaits\n");
1146 } 1358 }
1147 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1359 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1148 stat = &per_cpu(ptcstats, cpu); 1360 stat = &per_cpu(ptcstats, cpu);
@@ -1154,10 +1366,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1154 stat->s_ntargremotes, stat->s_ntargcpu, 1366 stat->s_ntargremotes, stat->s_ntargcpu,
1155 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1367 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
1156 stat->s_ntarguvhub, stat->s_ntarguvhub16); 1368 stat->s_ntarguvhub, stat->s_ntarguvhub16);
1157 seq_printf(file, "%ld %ld %ld %ld %ld ", 1369 seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
1158 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 1370 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
1159 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 1371 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
1160 stat->s_dtimeout); 1372 stat->s_dtimeout, stat->s_strongnacks);
1161 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 1373 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
1162 stat->s_retry_messages, stat->s_retriesok, 1374 stat->s_retry_messages, stat->s_retriesok,
1163 stat->s_resets_plug, stat->s_resets_timeout, 1375 stat->s_resets_plug, stat->s_resets_timeout,
@@ -1173,8 +1385,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1173 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1385 stat->d_nomsg, stat->d_retries, stat->d_canceled,
1174 stat->d_nocanceled, stat->d_resets, 1386 stat->d_nocanceled, stat->d_resets,
1175 stat->d_rcanceled); 1387 stat->d_rcanceled);
1176 seq_printf(file, "%ld %ld\n", 1388 seq_printf(file, "%ld %ld %ld %ld %ld\n",
1177 stat->s_bau_disabled, stat->s_bau_reenabled); 1389 stat->s_bau_disabled, stat->s_bau_reenabled,
1390 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1391 stat->s_uv2_war_waits);
1178 } 1392 }
1179 return 0; 1393 return 0;
1180} 1394}
@@ -1432,12 +1646,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1432{ 1646{
1433 int i; 1647 int i;
1434 int cpu; 1648 int cpu;
1649 int uv1 = 0;
1435 unsigned long gpa; 1650 unsigned long gpa;
1436 unsigned long m; 1651 unsigned long m;
1437 unsigned long n; 1652 unsigned long n;
1438 size_t dsize; 1653 size_t dsize;
1439 struct bau_desc *bau_desc; 1654 struct bau_desc *bau_desc;
1440 struct bau_desc *bd2; 1655 struct bau_desc *bd2;
1656 struct uv1_bau_msg_header *uv1_hdr;
1657 struct uv2_bau_msg_header *uv2_hdr;
1441 struct bau_control *bcp; 1658 struct bau_control *bcp;
1442 1659
1443 /* 1660 /*
@@ -1451,6 +1668,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1451 gpa = uv_gpa(bau_desc); 1668 gpa = uv_gpa(bau_desc);
1452 n = uv_gpa_to_gnode(gpa); 1669 n = uv_gpa_to_gnode(gpa);
1453 m = uv_gpa_to_offset(gpa); 1670 m = uv_gpa_to_offset(gpa);
1671 if (is_uv1_hub())
1672 uv1 = 1;
1454 1673
1455 /* the 14-bit pnode */ 1674 /* the 14-bit pnode */
1456 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); 1675 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
@@ -1461,21 +1680,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1461 */ 1680 */
1462 for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { 1681 for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1463 memset(bd2, 0, sizeof(struct bau_desc)); 1682 memset(bd2, 0, sizeof(struct bau_desc));
1464 bd2->header.swack_flag = 1; 1683 if (uv1) {
1465 /* 1684 uv1_hdr = &bd2->header.uv1_hdr;
1466 * The base_dest_nasid set in the message header is the nasid 1685 uv1_hdr->swack_flag = 1;
1467 * of the first uvhub in the partition. The bit map will 1686 /*
1468 * indicate destination pnode numbers relative to that base. 1687 * The base_dest_nasid set in the message header
1469 * They may not be consecutive if nasid striding is being used. 1688 * is the nasid of the first uvhub in the partition.
1470 */ 1689 * The bit map will indicate destination pnode numbers
1471 bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); 1690 * relative to that base. They may not be consecutive
1472 bd2->header.dest_subnodeid = UV_LB_SUBNODEID; 1691 * if nasid striding is being used.
1473 bd2->header.command = UV_NET_ENDPOINT_INTD; 1692 */
1474 bd2->header.int_both = 1; 1693 uv1_hdr->base_dest_nasid =
1475 /* 1694 UV_PNODE_TO_NASID(base_pnode);
1476 * all others need to be set to zero: 1695 uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
1477 * fairness chaining multilevel count replied_to 1696 uv1_hdr->command = UV_NET_ENDPOINT_INTD;
1478 */ 1697 uv1_hdr->int_both = 1;
1698 /*
1699 * all others need to be set to zero:
1700 * fairness chaining multilevel count replied_to
1701 */
1702 } else {
1703 uv2_hdr = &bd2->header.uv2_hdr;
1704 uv2_hdr->swack_flag = 1;
1705 uv2_hdr->base_dest_nasid =
1706 UV_PNODE_TO_NASID(base_pnode);
1707 uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID;
1708 uv2_hdr->command = UV_NET_ENDPOINT_INTD;
1709 }
1479 } 1710 }
1480 for_each_present_cpu(cpu) { 1711 for_each_present_cpu(cpu) {
1481 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) 1712 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
@@ -1531,6 +1762,7 @@ static void pq_init(int node, int pnode)
1531 write_mmr_payload_first(pnode, pn_first); 1762 write_mmr_payload_first(pnode, pn_first);
1532 write_mmr_payload_tail(pnode, first); 1763 write_mmr_payload_tail(pnode, first);
1533 write_mmr_payload_last(pnode, last); 1764 write_mmr_payload_last(pnode, last);
1765 write_gmmr_sw_ack(pnode, 0xffffUL);
1534 1766
1535 /* in effect, all msg_type's are set to MSG_NOOP */ 1767 /* in effect, all msg_type's are set to MSG_NOOP */
1536 memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); 1768 memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1584,14 +1816,14 @@ static int calculate_destination_timeout(void)
1584 ts_ns = base * mult1 * mult2; 1816 ts_ns = base * mult1 * mult2;
1585 ret = ts_ns / 1000; 1817 ret = ts_ns / 1000;
1586 } else { 1818 } else {
1587 /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */ 1819 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
1588 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); 1820 mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
1589 mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; 1821 mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
1590 if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) 1822 if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1591 mult1 = 80; 1823 base = 80;
1592 else 1824 else
1593 mult1 = 10; 1825 base = 10;
1594 base = mmr_image & UV2_ACK_MASK; 1826 mult1 = mmr_image & UV2_ACK_MASK;
1595 ret = mult1 * base; 1827 ret = mult1 * base;
1596 } 1828 }
1597 return ret; 1829 return ret;
@@ -1618,6 +1850,7 @@ static void __init init_per_cpu_tunables(void)
1618 bcp->cong_response_us = congested_respns_us; 1850 bcp->cong_response_us = congested_respns_us;
1619 bcp->cong_reps = congested_reps; 1851 bcp->cong_reps = congested_reps;
1620 bcp->cong_period = congested_period; 1852 bcp->cong_period = congested_period;
1853 bcp->clocks_per_100_usec = usec_2_cycles(100);
1621 } 1854 }
1622} 1855}
1623 1856
@@ -1728,8 +1961,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1728 bcp->cpus_in_socket = sdp->num_cpus; 1961 bcp->cpus_in_socket = sdp->num_cpus;
1729 bcp->socket_master = *smasterp; 1962 bcp->socket_master = *smasterp;
1730 bcp->uvhub = bdp->uvhub; 1963 bcp->uvhub = bdp->uvhub;
1964 if (is_uv1_hub())
1965 bcp->uvhub_version = 1;
1966 else if (is_uv2_hub())
1967 bcp->uvhub_version = 2;
1968 else {
1969 printk(KERN_EMERG "uvhub version not 1 or 2\n");
1970 return 1;
1971 }
1731 bcp->uvhub_master = *hmasterp; 1972 bcp->uvhub_master = *hmasterp;
1732 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; 1973 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1974 bcp->using_desc = bcp->uvhub_cpu;
1733 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { 1975 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1734 printk(KERN_EMERG "%d cpus per uvhub invalid\n", 1976 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1735 bcp->uvhub_cpu); 1977 bcp->uvhub_cpu);
@@ -1845,6 +2087,8 @@ static int __init uv_bau_init(void)
1845 uv_base_pnode = uv_blade_to_pnode(uvhub); 2087 uv_base_pnode = uv_blade_to_pnode(uvhub);
1846 } 2088 }
1847 2089
2090 enable_timeouts();
2091
1848 if (init_per_cpu(nuvhubs, uv_base_pnode)) { 2092 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
1849 nobau = 1; 2093 nobau = 1;
1850 return 0; 2094 return 0;
@@ -1855,7 +2099,6 @@ static int __init uv_bau_init(void)
1855 if (uv_blade_nr_possible_cpus(uvhub)) 2099 if (uv_blade_nr_possible_cpus(uvhub))
1856 init_uvhub(uvhub, vector, uv_base_pnode); 2100 init_uvhub(uvhub, vector, uv_base_pnode);
1857 2101
1858 enable_timeouts();
1859 alloc_intr_gate(vector, uv_bau_message_intr1); 2102 alloc_intr_gate(vector, uv_bau_message_intr1);
1860 2103
1861 for_each_possible_blade(uvhub) { 2104 for_each_possible_blade(uvhub) {
@@ -1867,7 +2110,8 @@ static int __init uv_bau_init(void)
1867 val = 1L << 63; 2110 val = 1L << 63;
1868 write_gmmr_activation(pnode, val); 2111 write_gmmr_activation(pnode, val);
1869 mmr = 1; /* should be 1 to broadcast to both sockets */ 2112 mmr = 1; /* should be 1 to broadcast to both sockets */
1870 write_mmr_data_broadcast(pnode, mmr); 2113 if (!is_uv1_hub())
2114 write_mmr_data_broadcast(pnode, mmr);
1871 } 2115 }
1872 } 2116 }
1873 2117
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..5d4ba301e776 100644
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -19,7 +19,7 @@
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson
20 */ 20 */
21 21
22#include <linux/sysdev.h> 22#include <linux/device.h>
23#include <asm/uv/bios.h> 23#include <asm/uv/bios.h>
24#include <asm/uv/uv.h> 24#include <asm/uv/uv.h>
25 25
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
new file mode 100644
index 000000000000..564b2476fede
--- /dev/null
+++ b/arch/x86/syscalls/Makefile
@@ -0,0 +1,43 @@
1out := $(obj)/../include/generated/asm
2
3# Create output directory if not already present
4_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
5
6syscall32 := $(srctree)/$(src)/syscall_32.tbl
7syscall64 := $(srctree)/$(src)/syscall_64.tbl
8
9syshdr := $(srctree)/$(src)/syscallhdr.sh
10systbl := $(srctree)/$(src)/syscalltbl.sh
11
12quiet_cmd_syshdr = SYSHDR $@
13 cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
14 $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
15quiet_cmd_systbl = SYSTBL $@
16 cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
17
18syshdr_abi_unistd_32 := i386
19$(out)/unistd_32.h: $(syscall32) $(syshdr)
20 $(call if_changed,syshdr)
21
22syshdr_abi_unistd_32_ia32 := i386
23syshdr_pfx_unistd_32_ia32 := ia32_
24$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
25 $(call if_changed,syshdr)
26
27syshdr_abi_unistd_64 := 64
28$(out)/unistd_64.h: $(syscall64) $(syshdr)
29 $(call if_changed,syshdr)
30
31$(out)/syscalls_32.h: $(syscall32) $(systbl)
32 $(call if_changed,systbl)
33$(out)/syscalls_64.h: $(syscall64) $(systbl)
34 $(call if_changed,systbl)
35
36syshdr-y += unistd_32.h unistd_64.h
37syshdr-y += syscalls_32.h
38syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h
39syshdr-$(CONFIG_X86_64) += syscalls_64.h
40
41targets += $(syshdr-y)
42
43all: $(addprefix $(out)/,$(targets))
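For a concrete sense of how these rules fire: building unistd_32_ia32.h runs the header script as roughly
    sh syscallhdr.sh syscall_32.tbl $(out)/unistd_32_ia32.h i386 ia32_
(paths abbreviated), with the abi and prefix arguments picked up from the syshdr_abi_/syshdr_pfx_ variables keyed on the target's basename; none of the targets above pass an offset argument.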
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..ce98e287c066
--- /dev/null
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,357 @@
1#
2# 32-bit system call numbers and entry vectors
3#
4# The format is:
5# <number> <abi> <name> <entry point> <compat entry point>
6#
7# The abi is always "i386" for this file.
8#
90 i386 restart_syscall sys_restart_syscall
101 i386 exit sys_exit
112 i386 fork ptregs_fork stub32_fork
123 i386 read sys_read
134 i386 write sys_write
145 i386 open sys_open compat_sys_open
156 i386 close sys_close
167 i386 waitpid sys_waitpid sys32_waitpid
178 i386 creat sys_creat
189 i386 link sys_link
1910 i386 unlink sys_unlink
2011 i386 execve ptregs_execve stub32_execve
2112 i386 chdir sys_chdir
2213 i386 time sys_time compat_sys_time
2314 i386 mknod sys_mknod
2415 i386 chmod sys_chmod
2516 i386 lchown sys_lchown16
2617 i386 break
2718 i386 oldstat sys_stat
2819 i386 lseek sys_lseek sys32_lseek
2920 i386 getpid sys_getpid
3021 i386 mount sys_mount compat_sys_mount
3122 i386 umount sys_oldumount
3223 i386 setuid sys_setuid16
3324 i386 getuid sys_getuid16
3425 i386 stime sys_stime compat_sys_stime
3526 i386 ptrace sys_ptrace compat_sys_ptrace
3627 i386 alarm sys_alarm
3728 i386 oldfstat sys_fstat
3829 i386 pause sys_pause
3930 i386 utime sys_utime compat_sys_utime
4031 i386 stty
4132 i386 gtty
4233 i386 access sys_access
4334 i386 nice sys_nice
4435 i386 ftime
4536 i386 sync sys_sync
4637 i386 kill sys_kill sys32_kill
4738 i386 rename sys_rename
4839 i386 mkdir sys_mkdir
4940 i386 rmdir sys_rmdir
5041 i386 dup sys_dup
5142 i386 pipe sys_pipe
5243 i386 times sys_times compat_sys_times
5344 i386 prof
5445 i386 brk sys_brk
5546 i386 setgid sys_setgid16
5647 i386 getgid sys_getgid16
5748 i386 signal sys_signal
5849 i386 geteuid sys_geteuid16
5950 i386 getegid sys_getegid16
6051 i386 acct sys_acct
6152 i386 umount2 sys_umount
6253 i386 lock
6354 i386 ioctl sys_ioctl compat_sys_ioctl
6455 i386 fcntl sys_fcntl compat_sys_fcntl64
6556 i386 mpx
6657 i386 setpgid sys_setpgid
6758 i386 ulimit
6859 i386 oldolduname sys_olduname
6960 i386 umask sys_umask
7061 i386 chroot sys_chroot
7162 i386 ustat sys_ustat compat_sys_ustat
7263 i386 dup2 sys_dup2
7364 i386 getppid sys_getppid
7465 i386 getpgrp sys_getpgrp
7566 i386 setsid sys_setsid
7667 i386 sigaction sys_sigaction sys32_sigaction
7768 i386 sgetmask sys_sgetmask
7869 i386 ssetmask sys_ssetmask
7970 i386 setreuid sys_setreuid16
8071 i386 setregid sys_setregid16
8172 i386 sigsuspend sys_sigsuspend sys32_sigsuspend
8273 i386 sigpending sys_sigpending compat_sys_sigpending
8374 i386 sethostname sys_sethostname
8475 i386 setrlimit sys_setrlimit compat_sys_setrlimit
8576 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit
8677 i386 getrusage sys_getrusage compat_sys_getrusage
8778 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday
8879 i386 settimeofday sys_settimeofday compat_sys_settimeofday
8980 i386 getgroups sys_getgroups16
9081 i386 setgroups sys_setgroups16
9182 i386 select sys_old_select compat_sys_old_select
9283 i386 symlink sys_symlink
9384 i386 oldlstat sys_lstat
9485 i386 readlink sys_readlink
9586 i386 uselib sys_uselib
9687 i386 swapon sys_swapon
9788 i386 reboot sys_reboot
9889 i386 readdir sys_old_readdir compat_sys_old_readdir
9990 i386 mmap sys_old_mmap sys32_mmap
10091 i386 munmap sys_munmap
10192 i386 truncate sys_truncate
10293 i386 ftruncate sys_ftruncate
10394 i386 fchmod sys_fchmod
10495 i386 fchown sys_fchown16
10596 i386 getpriority sys_getpriority
10697 i386 setpriority sys_setpriority
10798 i386 profil
10899 i386 statfs sys_statfs compat_sys_statfs
109100 i386 fstatfs sys_fstatfs compat_sys_fstatfs
110101 i386 ioperm sys_ioperm
111102 i386 socketcall sys_socketcall compat_sys_socketcall
112103 i386 syslog sys_syslog
113104 i386 setitimer sys_setitimer compat_sys_setitimer
114105 i386 getitimer sys_getitimer compat_sys_getitimer
115106 i386 stat sys_newstat compat_sys_newstat
116107 i386 lstat sys_newlstat compat_sys_newlstat
117108 i386 fstat sys_newfstat compat_sys_newfstat
118109 i386 olduname sys_uname
119110 i386 iopl ptregs_iopl stub32_iopl
120111 i386 vhangup sys_vhangup
121112 i386 idle
122113 i386 vm86old ptregs_vm86old sys32_vm86_warning
123114 i386 wait4 sys_wait4 compat_sys_wait4
124115 i386 swapoff sys_swapoff
125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
126117 i386 ipc sys_ipc sys32_ipc
127118 i386 fsync sys_fsync
128119 i386 sigreturn ptregs_sigreturn stub32_sigreturn
129120 i386 clone ptregs_clone stub32_clone
130121 i386 setdomainname sys_setdomainname
131122 i386 uname sys_newuname
132123 i386 modify_ldt sys_modify_ldt
133124 i386 adjtimex sys_adjtimex compat_sys_adjtimex
134125 i386 mprotect sys_mprotect sys32_mprotect
135126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask
136127 i386 create_module
137128 i386 init_module sys_init_module
138129 i386 delete_module sys_delete_module
139130 i386 get_kernel_syms
140131 i386 quotactl sys_quotactl sys32_quotactl
141132 i386 getpgid sys_getpgid
142133 i386 fchdir sys_fchdir
143134 i386 bdflush sys_bdflush
144135 i386 sysfs sys_sysfs
145136 i386 personality sys_personality
146137 i386 afs_syscall
147138 i386 setfsuid sys_setfsuid16
148139 i386 setfsgid sys_setfsgid16
149140 i386 _llseek sys_llseek
150141 i386 getdents sys_getdents compat_sys_getdents
151142 i386 _newselect sys_select compat_sys_select
152143 i386 flock sys_flock
153144 i386 msync sys_msync
154145 i386 readv sys_readv compat_sys_readv
155146 i386 writev sys_writev compat_sys_writev
156147 i386 getsid sys_getsid
157148 i386 fdatasync sys_fdatasync
158149 i386 _sysctl sys_sysctl compat_sys_sysctl
159150 i386 mlock sys_mlock
160151 i386 munlock sys_munlock
161152 i386 mlockall sys_mlockall
162153 i386 munlockall sys_munlockall
163154 i386 sched_setparam sys_sched_setparam
164155 i386 sched_getparam sys_sched_getparam
165156 i386 sched_setscheduler sys_sched_setscheduler
166157 i386 sched_getscheduler sys_sched_getscheduler
167158 i386 sched_yield sys_sched_yield
168159 i386 sched_get_priority_max sys_sched_get_priority_max
169160 i386 sched_get_priority_min sys_sched_get_priority_min
170161 i386 sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval
171162 i386 nanosleep sys_nanosleep compat_sys_nanosleep
172163 i386 mremap sys_mremap
173164 i386 setresuid sys_setresuid16
174165 i386 getresuid sys_getresuid16
175166 i386 vm86 ptregs_vm86 sys32_vm86_warning
176167 i386 query_module
177168 i386 poll sys_poll
178169 i386 nfsservctl
179170 i386 setresgid sys_setresgid16
180171 i386 getresgid sys_getresgid16
181172 i386 prctl sys_prctl
182173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn
183174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction
184175 i386 rt_sigprocmask sys_rt_sigprocmask sys32_rt_sigprocmask
185176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending
186177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait
187178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo
188179 i386 rt_sigsuspend sys_rt_sigsuspend
189180 i386 pread64 sys_pread64 sys32_pread
190181 i386 pwrite64 sys_pwrite64 sys32_pwrite
191182 i386 chown sys_chown16
192183 i386 getcwd sys_getcwd
193184 i386 capget sys_capget
194185 i386 capset sys_capset
195186 i386 sigaltstack ptregs_sigaltstack stub32_sigaltstack
196187 i386 sendfile sys_sendfile sys32_sendfile
197188 i386 getpmsg
198189 i386 putpmsg
199190 i386 vfork ptregs_vfork stub32_vfork
200191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
201192 i386 mmap2 sys_mmap_pgoff
202193 i386 truncate64 sys_truncate64 sys32_truncate64
203194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64
204195 i386 stat64 sys_stat64 sys32_stat64
205196 i386 lstat64 sys_lstat64 sys32_lstat64
206197 i386 fstat64 sys_fstat64 sys32_fstat64
207198 i386 lchown32 sys_lchown
208199 i386 getuid32 sys_getuid
209200 i386 getgid32 sys_getgid
210201 i386 geteuid32 sys_geteuid
211202 i386 getegid32 sys_getegid
212203 i386 setreuid32 sys_setreuid
213204 i386 setregid32 sys_setregid
214205 i386 getgroups32 sys_getgroups
215206 i386 setgroups32 sys_setgroups
216207 i386 fchown32 sys_fchown
217208 i386 setresuid32 sys_setresuid
218209 i386 getresuid32 sys_getresuid
219210 i386 setresgid32 sys_setresgid
220211 i386 getresgid32 sys_getresgid
221212 i386 chown32 sys_chown
222213 i386 setuid32 sys_setuid
223214 i386 setgid32 sys_setgid
224215 i386 setfsuid32 sys_setfsuid
225216 i386 setfsgid32 sys_setfsgid
226217 i386 pivot_root sys_pivot_root
227218 i386 mincore sys_mincore
228219 i386 madvise sys_madvise
229220 i386 getdents64 sys_getdents64 compat_sys_getdents64
230221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64
231# 222 is unused
232# 223 is unused
233224 i386 gettid sys_gettid
234225 i386 readahead sys_readahead sys32_readahead
235226 i386 setxattr sys_setxattr
236227 i386 lsetxattr sys_lsetxattr
237228 i386 fsetxattr sys_fsetxattr
238229 i386 getxattr sys_getxattr
239230 i386 lgetxattr sys_lgetxattr
240231 i386 fgetxattr sys_fgetxattr
241232 i386 listxattr sys_listxattr
242233 i386 llistxattr sys_llistxattr
243234 i386 flistxattr sys_flistxattr
244235 i386 removexattr sys_removexattr
245236 i386 lremovexattr sys_lremovexattr
246237 i386 fremovexattr sys_fremovexattr
247238 i386 tkill sys_tkill
248239 i386 sendfile64 sys_sendfile64
249240 i386 futex sys_futex compat_sys_futex
250241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity
251242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity
252243 i386 set_thread_area sys_set_thread_area
253244 i386 get_thread_area sys_get_thread_area
254245 i386 io_setup sys_io_setup compat_sys_io_setup
255246 i386 io_destroy sys_io_destroy
256247 i386 io_getevents sys_io_getevents compat_sys_io_getevents
257248 i386 io_submit sys_io_submit compat_sys_io_submit
258249 i386 io_cancel sys_io_cancel
259250 i386 fadvise64 sys_fadvise64 sys32_fadvise64
260# 251 is available for reuse (was briefly sys_set_zone_reclaim)
261252 i386 exit_group sys_exit_group
262253 i386 lookup_dcookie sys_lookup_dcookie sys32_lookup_dcookie
263254 i386 epoll_create sys_epoll_create
264255 i386 epoll_ctl sys_epoll_ctl
265256 i386 epoll_wait sys_epoll_wait
266257 i386 remap_file_pages sys_remap_file_pages
267258 i386 set_tid_address sys_set_tid_address
268259 i386 timer_create sys_timer_create compat_sys_timer_create
269260 i386 timer_settime sys_timer_settime compat_sys_timer_settime
270261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime
271262 i386 timer_getoverrun sys_timer_getoverrun
272263 i386 timer_delete sys_timer_delete
273264 i386 clock_settime sys_clock_settime compat_sys_clock_settime
274265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime
275266 i386 clock_getres sys_clock_getres compat_sys_clock_getres
276267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep
277268 i386 statfs64 sys_statfs64 compat_sys_statfs64
278269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
279270 i386 tgkill sys_tgkill
280271 i386 utimes sys_utimes compat_sys_utimes
281272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64
282273 i386 vserver
283274 i386 mbind sys_mbind
284275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
285276 i386 set_mempolicy sys_set_mempolicy
286277 i386 mq_open sys_mq_open compat_sys_mq_open
287278 i386 mq_unlink sys_mq_unlink
288279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend
289280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive
290281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
291282	i386	mq_getsetattr		sys_mq_getsetattr	compat_sys_mq_getsetattr
292283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
293284 i386 waitid sys_waitid compat_sys_waitid
294# 285 sys_setaltroot
295286 i386 add_key sys_add_key
296287 i386 request_key sys_request_key
297288 i386 keyctl sys_keyctl
298289 i386 ioprio_set sys_ioprio_set
299290 i386 ioprio_get sys_ioprio_get
300291 i386 inotify_init sys_inotify_init
301292 i386 inotify_add_watch sys_inotify_add_watch
302293 i386 inotify_rm_watch sys_inotify_rm_watch
303294 i386 migrate_pages sys_migrate_pages
304295 i386 openat sys_openat compat_sys_openat
305296 i386 mkdirat sys_mkdirat
306297 i386 mknodat sys_mknodat
307298 i386 fchownat sys_fchownat
308299 i386 futimesat sys_futimesat compat_sys_futimesat
309300 i386 fstatat64 sys_fstatat64 sys32_fstatat
310301 i386 unlinkat sys_unlinkat
311302 i386 renameat sys_renameat
312303 i386 linkat sys_linkat
313304 i386 symlinkat sys_symlinkat
314305 i386 readlinkat sys_readlinkat
315306 i386 fchmodat sys_fchmodat
316307 i386 faccessat sys_faccessat
317308 i386 pselect6 sys_pselect6 compat_sys_pselect6
318309 i386 ppoll sys_ppoll compat_sys_ppoll
319310 i386 unshare sys_unshare
320311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list
321312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list
322313 i386 splice sys_splice
323314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range
324315 i386 tee sys_tee
325316 i386 vmsplice sys_vmsplice compat_sys_vmsplice
326317 i386 move_pages sys_move_pages compat_sys_move_pages
327318 i386 getcpu sys_getcpu
328319 i386 epoll_pwait sys_epoll_pwait
329320 i386 utimensat sys_utimensat compat_sys_utimensat
330321 i386 signalfd sys_signalfd compat_sys_signalfd
331322 i386 timerfd_create sys_timerfd_create
332323 i386 eventfd sys_eventfd
333324 i386 fallocate sys_fallocate sys32_fallocate
334325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime
335326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime
336327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4
337328 i386 eventfd2 sys_eventfd2
338329 i386 epoll_create1 sys_epoll_create1
339330 i386 dup3 sys_dup3
340331 i386 pipe2 sys_pipe2
341332 i386 inotify_init1 sys_inotify_init1
342333 i386 preadv sys_preadv compat_sys_preadv
343334 i386 pwritev sys_pwritev compat_sys_pwritev
344335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
345336 i386 perf_event_open sys_perf_event_open
346337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg
347338 i386 fanotify_init sys_fanotify_init
348339 i386 fanotify_mark sys_fanotify_mark sys32_fanotify_mark
349340 i386 prlimit64 sys_prlimit64
350341 i386 name_to_handle_at sys_name_to_handle_at
351342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at
352343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime
353344 i386 syncfs sys_syncfs
354345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg
355346 i386 setns sys_setns
356347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..b440a8f7eefa
--- /dev/null
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,320 @@
1#
2# 64-bit system call numbers and entry vectors
3#
4# The format is:
5# <number> <abi> <name> <entry point>
6#
7# The abi is always "64" for this file (for now.)
8#
90 64 read sys_read
101 64 write sys_write
112 64 open sys_open
123 64 close sys_close
134 64 stat sys_newstat
145 64 fstat sys_newfstat
156 64 lstat sys_newlstat
167 64 poll sys_poll
178 64 lseek sys_lseek
189 64 mmap sys_mmap
1910 64 mprotect sys_mprotect
2011 64 munmap sys_munmap
2112 64 brk sys_brk
2213 64 rt_sigaction sys_rt_sigaction
2314 64 rt_sigprocmask sys_rt_sigprocmask
2415 64 rt_sigreturn stub_rt_sigreturn
2516 64 ioctl sys_ioctl
2617 64 pread64 sys_pread64
2718 64 pwrite64 sys_pwrite64
2819 64 readv sys_readv
2920 64 writev sys_writev
3021 64 access sys_access
3122 64 pipe sys_pipe
3223 64 select sys_select
3324 64 sched_yield sys_sched_yield
3425 64 mremap sys_mremap
3526 64 msync sys_msync
3627 64 mincore sys_mincore
3728 64 madvise sys_madvise
3829 64 shmget sys_shmget
3930 64 shmat sys_shmat
4031 64 shmctl sys_shmctl
4132 64 dup sys_dup
4233 64 dup2 sys_dup2
4334 64 pause sys_pause
4435 64 nanosleep sys_nanosleep
4536 64 getitimer sys_getitimer
4637 64 alarm sys_alarm
4738 64 setitimer sys_setitimer
4839 64 getpid sys_getpid
4940 64 sendfile sys_sendfile64
5041 64 socket sys_socket
5142 64 connect sys_connect
5243 64 accept sys_accept
5344 64 sendto sys_sendto
5445 64 recvfrom sys_recvfrom
5546 64 sendmsg sys_sendmsg
5647 64 recvmsg sys_recvmsg
5748 64 shutdown sys_shutdown
5849 64 bind sys_bind
5950 64 listen sys_listen
6051 64 getsockname sys_getsockname
6152 64 getpeername sys_getpeername
6253 64 socketpair sys_socketpair
6354 64 setsockopt sys_setsockopt
6455 64 getsockopt sys_getsockopt
6556 64 clone stub_clone
6657 64 fork stub_fork
6758 64 vfork stub_vfork
6859 64 execve stub_execve
6960 64 exit sys_exit
7061 64 wait4 sys_wait4
7162 64 kill sys_kill
7263 64 uname sys_newuname
7364 64 semget sys_semget
7465 64 semop sys_semop
7566 64 semctl sys_semctl
7667 64 shmdt sys_shmdt
7768 64 msgget sys_msgget
7869 64 msgsnd sys_msgsnd
7970 64 msgrcv sys_msgrcv
8071 64 msgctl sys_msgctl
8172 64 fcntl sys_fcntl
8273 64 flock sys_flock
8374 64 fsync sys_fsync
8475 64 fdatasync sys_fdatasync
8576 64 truncate sys_truncate
8677 64 ftruncate sys_ftruncate
8778 64 getdents sys_getdents
8879 64 getcwd sys_getcwd
8980 64 chdir sys_chdir
9081 64 fchdir sys_fchdir
9182 64 rename sys_rename
9283 64 mkdir sys_mkdir
9384 64 rmdir sys_rmdir
9485 64 creat sys_creat
9586 64 link sys_link
9687 64 unlink sys_unlink
9788 64 symlink sys_symlink
9889 64 readlink sys_readlink
9990 64 chmod sys_chmod
10091 64 fchmod sys_fchmod
10192 64 chown sys_chown
10293 64 fchown sys_fchown
10394 64 lchown sys_lchown
10495 64 umask sys_umask
10596 64 gettimeofday sys_gettimeofday
10697 64 getrlimit sys_getrlimit
10798 64 getrusage sys_getrusage
10899 64 sysinfo sys_sysinfo
109100 64 times sys_times
110101 64 ptrace sys_ptrace
111102 64 getuid sys_getuid
112103 64 syslog sys_syslog
113104 64 getgid sys_getgid
114105 64 setuid sys_setuid
115106 64 setgid sys_setgid
116107 64 geteuid sys_geteuid
117108 64 getegid sys_getegid
118109 64 setpgid sys_setpgid
119110 64 getppid sys_getppid
120111 64 getpgrp sys_getpgrp
121112 64 setsid sys_setsid
122113 64 setreuid sys_setreuid
123114 64 setregid sys_setregid
124115 64 getgroups sys_getgroups
125116 64 setgroups sys_setgroups
126117 64 setresuid sys_setresuid
127118 64 getresuid sys_getresuid
128119 64 setresgid sys_setresgid
129120 64 getresgid sys_getresgid
130121 64 getpgid sys_getpgid
131122 64 setfsuid sys_setfsuid
132123 64 setfsgid sys_setfsgid
133124 64 getsid sys_getsid
134125 64 capget sys_capget
135126 64 capset sys_capset
136127 64 rt_sigpending sys_rt_sigpending
137128 64 rt_sigtimedwait sys_rt_sigtimedwait
138129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
139130 64 rt_sigsuspend sys_rt_sigsuspend
140131 64 sigaltstack stub_sigaltstack
141132 64 utime sys_utime
142133 64 mknod sys_mknod
143134 64 uselib
144135 64 personality sys_personality
145136 64 ustat sys_ustat
146137 64 statfs sys_statfs
147138 64 fstatfs sys_fstatfs
148139 64 sysfs sys_sysfs
149140 64 getpriority sys_getpriority
150141 64 setpriority sys_setpriority
151142 64 sched_setparam sys_sched_setparam
152143 64 sched_getparam sys_sched_getparam
153144 64 sched_setscheduler sys_sched_setscheduler
154145 64 sched_getscheduler sys_sched_getscheduler
155146 64 sched_get_priority_max sys_sched_get_priority_max
156147 64 sched_get_priority_min sys_sched_get_priority_min
157148 64 sched_rr_get_interval sys_sched_rr_get_interval
158149 64 mlock sys_mlock
159150 64 munlock sys_munlock
160151 64 mlockall sys_mlockall
161152 64 munlockall sys_munlockall
162153 64 vhangup sys_vhangup
163154 64 modify_ldt sys_modify_ldt
164155 64 pivot_root sys_pivot_root
165156 64 _sysctl sys_sysctl
166157 64 prctl sys_prctl
167158 64 arch_prctl sys_arch_prctl
168159 64 adjtimex sys_adjtimex
169160 64 setrlimit sys_setrlimit
170161 64 chroot sys_chroot
171162 64 sync sys_sync
172163 64 acct sys_acct
173164 64 settimeofday sys_settimeofday
174165 64 mount sys_mount
175166 64 umount2 sys_umount
176167 64 swapon sys_swapon
177168 64 swapoff sys_swapoff
178169 64 reboot sys_reboot
179170 64 sethostname sys_sethostname
180171 64 setdomainname sys_setdomainname
181172 64 iopl stub_iopl
182173 64 ioperm sys_ioperm
183174 64 create_module
184175 64 init_module sys_init_module
185176 64 delete_module sys_delete_module
186177 64 get_kernel_syms
187178 64 query_module
188179 64 quotactl sys_quotactl
189180 64 nfsservctl
190181 64 getpmsg
191182 64 putpmsg
192183 64 afs_syscall
193184 64 tuxcall
194185 64 security
195186 64 gettid sys_gettid
196187 64 readahead sys_readahead
197188 64 setxattr sys_setxattr
198189 64 lsetxattr sys_lsetxattr
199190 64 fsetxattr sys_fsetxattr
200191 64 getxattr sys_getxattr
201192 64 lgetxattr sys_lgetxattr
202193 64 fgetxattr sys_fgetxattr
203194 64 listxattr sys_listxattr
204195 64 llistxattr sys_llistxattr
205196 64 flistxattr sys_flistxattr
206197 64 removexattr sys_removexattr
207198 64 lremovexattr sys_lremovexattr
208199 64 fremovexattr sys_fremovexattr
209200 64 tkill sys_tkill
210201 64 time sys_time
211202 64 futex sys_futex
212203 64 sched_setaffinity sys_sched_setaffinity
213204 64 sched_getaffinity sys_sched_getaffinity
214205 64 set_thread_area
215206 64 io_setup sys_io_setup
216207 64 io_destroy sys_io_destroy
217208 64 io_getevents sys_io_getevents
218209 64 io_submit sys_io_submit
219210 64 io_cancel sys_io_cancel
220211 64 get_thread_area
221212 64 lookup_dcookie sys_lookup_dcookie
222213 64 epoll_create sys_epoll_create
223214 64 epoll_ctl_old
224215 64 epoll_wait_old
225216 64 remap_file_pages sys_remap_file_pages
226217 64 getdents64 sys_getdents64
227218 64 set_tid_address sys_set_tid_address
228219 64 restart_syscall sys_restart_syscall
229220 64 semtimedop sys_semtimedop
230221 64 fadvise64 sys_fadvise64
231222 64 timer_create sys_timer_create
232223 64 timer_settime sys_timer_settime
233224 64 timer_gettime sys_timer_gettime
234225 64 timer_getoverrun sys_timer_getoverrun
235226 64 timer_delete sys_timer_delete
236227 64 clock_settime sys_clock_settime
237228 64 clock_gettime sys_clock_gettime
238229 64 clock_getres sys_clock_getres
239230 64 clock_nanosleep sys_clock_nanosleep
240231 64 exit_group sys_exit_group
241232 64 epoll_wait sys_epoll_wait
242233 64 epoll_ctl sys_epoll_ctl
243234 64 tgkill sys_tgkill
244235 64 utimes sys_utimes
245236 64 vserver
246237 64 mbind sys_mbind
247238 64 set_mempolicy sys_set_mempolicy
248239 64 get_mempolicy sys_get_mempolicy
249240 64 mq_open sys_mq_open
250241 64 mq_unlink sys_mq_unlink
251242 64 mq_timedsend sys_mq_timedsend
252243 64 mq_timedreceive sys_mq_timedreceive
253244 64 mq_notify sys_mq_notify
254245 64 mq_getsetattr sys_mq_getsetattr
255246 64 kexec_load sys_kexec_load
256247 64 waitid sys_waitid
257248 64 add_key sys_add_key
258249 64 request_key sys_request_key
259250 64 keyctl sys_keyctl
260251 64 ioprio_set sys_ioprio_set
261252 64 ioprio_get sys_ioprio_get
262253 64 inotify_init sys_inotify_init
263254 64 inotify_add_watch sys_inotify_add_watch
264255 64 inotify_rm_watch sys_inotify_rm_watch
265256 64 migrate_pages sys_migrate_pages
266257 64 openat sys_openat
267258 64 mkdirat sys_mkdirat
268259 64 mknodat sys_mknodat
269260 64 fchownat sys_fchownat
270261 64 futimesat sys_futimesat
271262 64 newfstatat sys_newfstatat
272263 64 unlinkat sys_unlinkat
273264 64 renameat sys_renameat
274265 64 linkat sys_linkat
275266 64 symlinkat sys_symlinkat
276267 64 readlinkat sys_readlinkat
277268 64 fchmodat sys_fchmodat
278269 64 faccessat sys_faccessat
279270 64 pselect6 sys_pselect6
280271 64 ppoll sys_ppoll
281272 64 unshare sys_unshare
282273 64 set_robust_list sys_set_robust_list
283274 64 get_robust_list sys_get_robust_list
284275 64 splice sys_splice
285276 64 tee sys_tee
286277 64 sync_file_range sys_sync_file_range
287278 64 vmsplice sys_vmsplice
288279 64 move_pages sys_move_pages
289280 64 utimensat sys_utimensat
290281 64 epoll_pwait sys_epoll_pwait
291282 64 signalfd sys_signalfd
292283 64 timerfd_create sys_timerfd_create
293284 64 eventfd sys_eventfd
294285 64 fallocate sys_fallocate
295286 64 timerfd_settime sys_timerfd_settime
296287 64 timerfd_gettime sys_timerfd_gettime
297288 64 accept4 sys_accept4
298289 64 signalfd4 sys_signalfd4
299290 64 eventfd2 sys_eventfd2
300291 64 epoll_create1 sys_epoll_create1
301292 64 dup3 sys_dup3
302293 64 pipe2 sys_pipe2
303294 64 inotify_init1 sys_inotify_init1
304295 64 preadv sys_preadv
305296 64 pwritev sys_pwritev
306297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
307298 64 perf_event_open sys_perf_event_open
308299 64 recvmmsg sys_recvmmsg
309300 64 fanotify_init sys_fanotify_init
310301 64 fanotify_mark sys_fanotify_mark
311302 64 prlimit64 sys_prlimit64
312303 64 name_to_handle_at sys_name_to_handle_at
313304 64 open_by_handle_at sys_open_by_handle_at
314305 64 clock_adjtime sys_clock_adjtime
315306 64 syncfs sys_syncfs
316307 64 sendmmsg sys_sendmmsg
317308 64 setns sys_setns
318309 64 getcpu sys_getcpu
319310 64 process_vm_readv sys_process_vm_readv
320311 64 process_vm_writev sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
new file mode 100644
index 000000000000..31fd5f1f38f7
--- /dev/null
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -0,0 +1,27 @@
1#!/bin/sh
2
3in="$1"
4out="$2"
5my_abis=`echo "($3)" | tr ',' '|'`
6prefix="$4"
7offset="$5"
8
9fileguard=_ASM_X86_`basename "$out" | sed \
10 -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
11 -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
12grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
13 echo "#ifndef ${fileguard}"
14 echo "#define ${fileguard} 1"
15 echo ""
16
17 while read nr abi name entry ; do
18 if [ -z "$offset" ]; then
19 echo "#define __NR_${prefix}${name} $nr"
20 else
21 echo "#define __NR_${prefix}${name} ($offset + $nr)"
22 fi
23 done
24
25 echo ""
26 echo "#endif /* ${fileguard} */"
27) > "$out"
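Tying this to the tables above: for the i386 open entry (number 5), the loop's echo lines produce, depending on the arguments passed,
    #define __NR_open 5              /* unistd_32.h: no prefix, no offset */
    #define __NR_ia32_open 5         /* unistd_32_ia32.h: prefix "ia32_" */
    #define __NR_ia32_open (16 + 5)  /* hypothetical run with an offset of 16 */
all wrapped in the _ASM_X86_... include guard derived from the output file name. The offset form is only a sketch; the Makefile targets above do not use one.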
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh
new file mode 100644
index 000000000000..0e7f8ec071e7
--- /dev/null
+++ b/arch/x86/syscalls/syscalltbl.sh
@@ -0,0 +1,15 @@
1#!/bin/sh
2
3in="$1"
4out="$2"
5
6grep '^[0-9]' "$in" | sort -n | (
7 while read nr abi name entry compat; do
8 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
9 if [ -n "$compat" ]; then
10 echo "__SYSCALL_${abi}($nr, $entry, $compat)"
11 elif [ -n "$entry" ]; then
12 echo "__SYSCALL_${abi}($nr, $entry, $entry)"
13 fi
14 done
15) > "$out"
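Similarly, the table script emits one __SYSCALL_<ABI> line per populated row, repeating the native entry point when the compat column is empty and skipping rows with no entry point at all. For the i386 read (3) and open (5) entries above it would emit roughly
    __SYSCALL_I386(3, sys_read, sys_read)
    __SYSCALL_I386(5, sys_open, compat_sys_open)
which is what the generated syscalls_32.h consists of.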
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index f82082677337..d511aa97533a 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
18quiet_cmd_posttest = TEST $@ 18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) 19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20 20
21posttest: $(obj)/test_get_len vmlinux 21quiet_cmd_sanitytest = TEST $@
22 cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
23
24posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity
22 $(call cmd,posttest) 25 $(call cmd,posttest)
26 $(call cmd,sanitytest)
23 27
24hostprogs-y := test_get_len 28hostprogs-y += test_get_len insn_sanity
25 29
26# -I needed for generated C source and C source which is in the kernel tree. 30# -I needed for generated C source and C source which is in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ 31HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28 32
33HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
34
29# Dependencies are also needed. 35# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c 36$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31 37
38$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index eaf11f52fc0b..5f6a5b6c3a15 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -47,7 +47,7 @@ BEGIN {
47 sep_expr = "^\\|$" 47 sep_expr = "^\\|$"
48 group_expr = "^Grp[0-9A-Za-z]+" 48 group_expr = "^Grp[0-9A-Za-z]+"
49 49
50 imm_expr = "^[IJAO][a-z]" 50 imm_expr = "^[IJAOL][a-z]"
51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" 51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" 52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" 53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
@@ -59,6 +59,7 @@ BEGIN {
59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" 59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
60 imm_flag["Ob"] = "INAT_MOFFSET" 60 imm_flag["Ob"] = "INAT_MOFFSET"
61 imm_flag["Ov"] = "INAT_MOFFSET" 61 imm_flag["Ov"] = "INAT_MOFFSET"
62 imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
62 63
63 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" 64 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
64 force64_expr = "\\([df]64\\)" 65 force64_expr = "\\([df]64\\)"
@@ -70,8 +71,12 @@ BEGIN {
70 lprefix3_expr = "\\(F2\\)" 71 lprefix3_expr = "\\(F2\\)"
71 max_lprefix = 4 72 max_lprefix = 4
72 73
73 vexok_expr = "\\(VEX\\)" 74 # All opcodes starting with lower-case 'v' or with (v1) superscript
74 vexonly_expr = "\\(oVEX\\)" 75 # accept a VEX prefix
76 vexok_opcode_expr = "^v.*"
77 vexok_expr = "\\(v1\\)"
78 # All opcodes with (v) superscript support *only* the VEX prefix
79 vexonly_expr = "\\(v\\)"
75 80
76 prefix_expr = "\\(Prefix\\)" 81 prefix_expr = "\\(Prefix\\)"
77 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" 82 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -85,8 +90,8 @@ BEGIN {
85 prefix_num["SEG=GS"] = "INAT_PFX_GS" 90 prefix_num["SEG=GS"] = "INAT_PFX_GS"
86 prefix_num["SEG=SS"] = "INAT_PFX_SS" 91 prefix_num["SEG=SS"] = "INAT_PFX_SS"
87 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" 92 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
88 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" 93 prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
89 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" 94 prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
90 95
91 clear_vars() 96 clear_vars()
92} 97}
@@ -310,12 +315,10 @@ function convert_operands(count,opnd, i,j,imm,mod)
310 if (match(opcode, fpu_expr)) 315 if (match(opcode, fpu_expr))
311 flags = add_flags(flags, "INAT_MODRM") 316 flags = add_flags(flags, "INAT_MODRM")
312 317
313 # check VEX only code 318 # check VEX codes
314 if (match(ext, vexonly_expr)) 319 if (match(ext, vexonly_expr))
315 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") 320 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
316 321 else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
317 # check VEX only code
318 if (match(ext, vexok_expr))
319 flags = add_flags(flags, "INAT_VEXOK") 322 flags = add_flags(flags, "INAT_VEXOK")
320 323
321 # check prefixes 324 # check prefixes
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
new file mode 100644
index 000000000000..cc2f8c131286
--- /dev/null
+++ b/arch/x86/tools/insn_sanity.c
@@ -0,0 +1,275 @@
1/*
2 * x86 decoder sanity test - based on test_get_insn.c
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 * Copyright (C) Hitachi, Ltd., 2011
20 */
21
22#include <stdlib.h>
23#include <stdio.h>
24#include <string.h>
25#include <assert.h>
26#include <unistd.h>
27#include <sys/types.h>
28#include <sys/stat.h>
29#include <fcntl.h>
30
31#define unlikely(cond) (cond)
32#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
33
34#include <asm/insn.h>
35#include <inat.c>
36#include <insn.c>
37
38/*
39 * Test of instruction analysis against tampering.
40 * Feed random binary data to the instruction decoder and ensure it does
41 * not access memory outside the instruction buffer.
42 */
43
44#define DEFAULT_MAX_ITER 10000
45#define INSN_NOP 0x90
46
47static const char *prog; /* Program name */
48static int verbose; /* Verbosity */
49static int x86_64; /* x86-64 bit mode flag */
50static unsigned int seed; /* Random seed */
51static unsigned long iter_start; /* Start of iteration number */
52static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */
53static FILE *input_file; /* Input file name */
54
55static void usage(const char *err)
56{
57 if (err)
58 fprintf(stderr, "Error: %s\n\n", err);
59 fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog);
60 fprintf(stderr, "\t-y 64bit mode\n");
61 fprintf(stderr, "\t-n 32bit mode\n");
62 fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n");
63 fprintf(stderr, "\t-s Give a random seed (and iteration number)\n");
64 fprintf(stderr, "\t-m Give a maximum iteration number\n");
65 fprintf(stderr, "\t-i Give an input file with decoded binary\n");
66 exit(1);
67}
68
69static void dump_field(FILE *fp, const char *name, const char *indent,
70 struct insn_field *field)
71{
72 fprintf(fp, "%s.%s = {\n", indent, name);
73 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
74 indent, field->value, field->bytes[0], field->bytes[1],
75 field->bytes[2], field->bytes[3]);
76 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
77 field->got, field->nbytes);
78}
79
80static void dump_insn(FILE *fp, struct insn *insn)
81{
82 fprintf(fp, "Instruction = {\n");
83 dump_field(fp, "prefixes", "\t", &insn->prefixes);
84 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
85 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
86 dump_field(fp, "opcode", "\t", &insn->opcode);
87 dump_field(fp, "modrm", "\t", &insn->modrm);
88 dump_field(fp, "sib", "\t", &insn->sib);
89 dump_field(fp, "displacement", "\t", &insn->displacement);
90 dump_field(fp, "immediate1", "\t", &insn->immediate1);
91 dump_field(fp, "immediate2", "\t", &insn->immediate2);
92 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
93 insn->attr, insn->opnd_bytes, insn->addr_bytes);
94 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
95 insn->length, insn->x86_64, insn->kaddr);
96}
97
98static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
99 unsigned char *insn_buf, struct insn *insn)
100{
101 int i;
102
103 fprintf(fp, "%s:\n", msg);
104
105 dump_insn(fp, insn);
106
107 fprintf(fp, "You can reproduce this with the command(s) below:\n");
108
109 /* Input a decoded instruction sequence directly */
110 fprintf(fp, " $ echo ");
111 for (i = 0; i < MAX_INSN_SIZE; i++)
112 fprintf(fp, " %02x", insn_buf[i]);
113 fprintf(fp, " | %s -i -\n", prog);
114
115 if (!input_file) {
116 fprintf(fp, "Or \n");
117 /* Give a seed and iteration number */
118 fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter);
119 }
120}
121
122static void init_random_seed(void)
123{
124 int fd;
125
126 fd = open("/dev/urandom", O_RDONLY);
127 if (fd < 0)
128 goto fail;
129
130 if (read(fd, &seed, sizeof(seed)) != sizeof(seed))
131 goto fail;
132
133 close(fd);
134 return;
135fail:
136 usage("Failed to open /dev/urandom");
137}
138
139/* Read given instruction sequence from the input file */
140static int read_next_insn(unsigned char *insn_buf)
141{
142 char buf[256] = "", *tmp;
143 int i;
144
145 tmp = fgets(buf, ARRAY_SIZE(buf), input_file);
146 if (tmp == NULL || feof(input_file))
147 return 0;
148
149 for (i = 0; i < MAX_INSN_SIZE; i++) {
150 insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
151 if (*tmp != ' ')
152 break;
153 }
154
155 return i;
156}
157
158static int generate_insn(unsigned char *insn_buf)
159{
160 int i;
161
162 if (input_file)
163 return read_next_insn(insn_buf);
164
165 /* Fills buffer with random binary up to MAX_INSN_SIZE */
166 for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
167 *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
168
169 while (i < MAX_INSN_SIZE)
170 insn_buf[i++] = random() & 0xff;
171
172 return i;
173}
174
175static void parse_args(int argc, char **argv)
176{
177 int c;
178 char *tmp = NULL;
179 int set_seed = 0;
180
181 prog = argv[0];
182 while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) {
183 switch (c) {
184 case 'y':
185 x86_64 = 1;
186 break;
187 case 'n':
188 x86_64 = 0;
189 break;
190 case 'v':
191 verbose++;
192 break;
193 case 'i':
194 if (strcmp("-", optarg) == 0)
195 input_file = stdin;
196 else
197 input_file = fopen(optarg, "r");
198 if (!input_file)
199 usage("Failed to open input file");
200 break;
201 case 's':
202 seed = (unsigned int)strtoul(optarg, &tmp, 0);
203 if (*tmp == ',') {
204 optarg = tmp + 1;
205 iter_start = strtoul(optarg, &tmp, 0);
206 }
207 if (*tmp != '\0' || tmp == optarg)
208 usage("Failed to parse seed");
209 set_seed = 1;
210 break;
211 case 'm':
212 iter_end = strtoul(optarg, &tmp, 0);
213 if (*tmp != '\0' || tmp == optarg)
214 usage("Failed to parse max_iter");
215 break;
216 default:
217 usage(NULL);
218 }
219 }
220
221 /* Check errors */
222 if (iter_end < iter_start)
223 usage("Max iteration number must be bigger than iter-num");
224
225 if (set_seed && input_file)
226 usage("Don't use input file (-i) with random seed (-s)");
227
228 /* Initialize random seed */
229 if (!input_file) {
230 if (!set_seed) /* No seed is given */
231 init_random_seed();
232 srand(seed);
233 }
234}
235
236int main(int argc, char **argv)
237{
238 struct insn insn;
239 int insns = 0;
240 int errors = 0;
241 unsigned long i;
242 unsigned char insn_buf[MAX_INSN_SIZE * 2];
243
244 parse_args(argc, argv);
245
246 /* Prepare stop bytes with NOPs */
247 memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
248
249 for (i = 0; i < iter_end; i++) {
250 if (generate_insn(insn_buf) <= 0)
251 break;
252
253 if (i < iter_start) /* Skip to given iteration number */
254 continue;
255
256 /* Decode an instruction */
257 insn_init(&insn, insn_buf, x86_64);
258 insn_get_length(&insn);
259
260 if (insn.next_byte <= insn.kaddr ||
261 insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
262 /* Access out-of-range memory */
263 dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
264 errors++;
265 } else if (verbose && !insn_complete(&insn))
266 dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
267 else if (verbose >= 2)
268 dump_insn(stdout, &insn);
269 insns++;
270 }
271
272 fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed);
273
274 return errors ? 1 : 0;
275}
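A note on running this: the tools Makefile above invokes it as insn_sanity $(posttest_64bit) -m 1000000 during posttest, and a reported failure can then be replayed from the seed/iteration pair it prints, e.g. (hypothetical values)
    ./insn_sanity -y -s 0x12345678,42
assuming -y selects 64-bit mode as in the usage text; with -i - it instead reads hex-dumped instruction bytes from stdin.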
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,14 +6,6 @@ menu "UML-specific options"
6 6
7menu "Host processor type and features" 7menu "Host processor type and features"
8 8
9config CMPXCHG_LOCAL
10 bool
11 default n
12
13config CMPXCHG_DOUBLE
14 bool
15 default n
16
17source "arch/x86/Kconfig.cpu" 9source "arch/x86/Kconfig.cpu"
18 10
19endmenu 11endmenu
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8fb58400e415..5d065b2222d3 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o
37USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o 37USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
38 38
39extra-y += user-offsets.s 39extra-y += user-offsets.s
40$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) 40$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
41 -Iarch/x86/include/generated
41 42
42UNPROFILE_OBJS := stub_segv.o 43UNPROFILE_OBJS := stub_segv.o
43CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) 44CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b1621747f..2bbe1ec2d96a 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,5 +1,15 @@
1#ifndef __SYSDEP_X86_PTRACE_H
2#define __SYSDEP_X86_PTRACE_H
3
1#ifdef __i386__ 4#ifdef __i386__
2#include "ptrace_32.h" 5#include "ptrace_32.h"
3#else 6#else
4#include "ptrace_64.h" 7#include "ptrace_64.h"
5#endif 8#endif
9
10static inline long regs_return_value(struct uml_pt_regs *regs)
11{
12 return UPT_SYSCALL_RET(regs);
13}
14
15#endif /* __SYSDEP_X86_PTRACE_H */
diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S
deleted file mode 100644
index a7ca80d2dceb..000000000000
--- a/arch/x86/um/sys_call_table_32.S
+++ /dev/null
@@ -1,26 +0,0 @@
1#include <linux/linkage.h>
2/* Steal i386 syscall table for our purposes, but with some slight changes.*/
3
4#define sys_iopl sys_ni_syscall
5#define sys_ioperm sys_ni_syscall
6
7#define sys_vm86old sys_ni_syscall
8#define sys_vm86 sys_ni_syscall
9
10#define old_mmap sys_old_mmap
11
12#define ptregs_fork sys_fork
13#define ptregs_execve sys_execve
14#define ptregs_iopl sys_iopl
15#define ptregs_vm86old sys_vm86old
16#define ptregs_clone sys_clone
17#define ptregs_vm86 sys_vm86
18#define ptregs_sigaltstack sys_sigaltstack
19#define ptregs_vfork sys_vfork
20
21.section .rodata,"a"
22
23#include "../kernel/syscall_table_32.S"
24
25ENTRY(syscall_table_size)
26.long .-sys_call_table
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 000000000000..416bd40c0eba
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,55 @@
1/*
2 * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
3 * with some changes for UML.
4 */
5
6#include <linux/linkage.h>
7#include <linux/sys.h>
8#include <linux/cache.h>
9#include <generated/user_constants.h>
10
11#define __NO_STUBS
12
13/*
14 * Below you can see, in terms of #define's, the differences between the i386
15 * and the UML syscall table.
16 */
17
18/* Not going to be implemented by UML, since we have no hardware. */
19#define sys_iopl sys_ni_syscall
20#define sys_ioperm sys_ni_syscall
21
22#define sys_vm86old sys_ni_syscall
23#define sys_vm86 sys_ni_syscall
24
25#define old_mmap sys_old_mmap
26
27#define ptregs_fork sys_fork
28#define ptregs_execve sys_execve
29#define ptregs_iopl sys_iopl
30#define ptregs_vm86old sys_vm86old
31#define ptregs_clone sys_clone
32#define ptregs_vm86 sys_vm86
33#define ptregs_sigaltstack sys_sigaltstack
34#define ptregs_vfork sys_vfork
35
36#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
37#include <asm/syscalls_32.h>
38
39#undef __SYSCALL_I386
40#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
41
42typedef void (*sys_call_ptr_t)(void);
43
44extern void sys_ni_syscall(void);
45
46const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
47 /*
48 * Smells like a compiler bug -- it doesn't work
49 * when the & below is removed.
50 */
51 [0 ... __NR_syscall_max] = &sys_ni_syscall,
52#include <asm/syscalls_32.h>
53};
54
55int syscall_table_size = sizeof(sys_call_table);
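The double include of <asm/syscalls_32.h> (itself generated from syscall_32.tbl by the scripts above) works because __SYSCALL_I386 is redefined between the two passes: the first pass declares every entry point, the second fills the table with designated initializers. A minimal sketch of the expansion for the read entry (number 3):
    extern asmlinkage void sys_read(void);   /* first pass, at file scope */
    ...
    [ 3 ] = sys_read,                        /* second pass, inside sys_call_table[] */
with the [0 ... __NR_syscall_max] default keeping any unlisted slot pointed at sys_ni_syscall.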
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 99522f78b162..fe626c3ba01b 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c 2 * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
3 * with some changes for UML. 3 * with some changes for UML.
4 */ 4 */
5 5
6#include <linux/linkage.h> 6#include <linux/linkage.h>
7#include <linux/sys.h> 7#include <linux/sys.h>
8#include <linux/cache.h> 8#include <linux/cache.h>
9#include <generated/user_constants.h>
9 10
10#define __NO_STUBS 11#define __NO_STUBS
11 12
@@ -34,31 +35,23 @@
34#define stub_sigaltstack sys_sigaltstack 35#define stub_sigaltstack sys_sigaltstack
35#define stub_rt_sigreturn sys_rt_sigreturn 36#define stub_rt_sigreturn sys_rt_sigreturn
36 37
37#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 38#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
38#undef _ASM_X86_UNISTD_64_H 39#include <asm/syscalls_64.h>
39#include "../../x86/include/asm/unistd_64.h"
40 40
41#undef __SYSCALL 41#undef __SYSCALL_64
42#define __SYSCALL(nr, sym) [ nr ] = sym, 42#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
43#undef _ASM_X86_UNISTD_64_H
44 43
45typedef void (*sys_call_ptr_t)(void); 44typedef void (*sys_call_ptr_t)(void);
46 45
47extern void sys_ni_syscall(void); 46extern void sys_ni_syscall(void);
48 47
49/* 48const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
50 * We used to have a trick here which made sure that holes in the 49 /*
51 * x86_64 table were filled in with sys_ni_syscall, but a comment in 50 * Smells like a compiler bug -- it doesn't work
52 * unistd_64.h says that holes aren't allowed, so the trick was 51 * when the & below is removed.
53 * removed. 52 */
54 * The trick looked like this 53 [0 ... __NR_syscall_max] = &sys_ni_syscall,
55 * [0 ... UM_NR_syscall_max] = &sys_ni_syscall 54#include <asm/syscalls_64.h>
56 * before including unistd_64.h - the later initializations overwrote
57 * the sys_ni_syscall filler.
58 */
59
60sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
61#include <asm/unistd_64.h>
62}; 55};
63 56
64int syscall_table_size = sizeof(sys_call_table); 57int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ca49be8ddd0c..5edf4f4bbf53 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,6 +8,18 @@
8#include <asm/ptrace.h> 8#include <asm/ptrace.h>
9#include <asm/types.h> 9#include <asm/types.h>
10 10
11#ifdef __i386__
12#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
13static char syscalls[] = {
14#include <asm/syscalls_32.h>
15};
16#else
17#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
18static char syscalls[] = {
19#include <asm/syscalls_64.h>
20};
21#endif
22
11#define DEFINE(sym, val) \ 23#define DEFINE(sym, val) \
12 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 24 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
13 25
@@ -77,4 +89,7 @@ void foo(void)
77 DEFINE(UM_PROT_READ, PROT_READ); 89 DEFINE(UM_PROT_READ, PROT_READ);
78 DEFINE(UM_PROT_WRITE, PROT_WRITE); 90 DEFINE(UM_PROT_WRITE, PROT_WRITE);
79 DEFINE(UM_PROT_EXEC, PROT_EXEC); 91 DEFINE(UM_PROT_EXEC, PROT_EXEC);
92
93 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
94 DEFINE(NR_syscalls, sizeof(syscalls));
80} 95}
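The syscalls[] arrays added here exist only for their size: each __SYSCALL_* row becomes a [nr] = 1 initializer, so the array length is one greater than the highest syscall number in the generated header. With the 64-bit table above ending at 311 (process_vm_writev), sizeof(syscalls) comes to 312, so the generated constants give __NR_syscall_max = 311 and NR_syscalls = 312, which is what sizes sys_call_table[] in the UML files above.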
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 26c731a106af..fdce49c7aff6 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
29 29
30config XEN_MAX_DOMAIN_MEMORY 30config XEN_MAX_DOMAIN_MEMORY
31 int 31 int
32 default 128 32 default 500 if X86_64
33 default 64 if X86_32
33 depends on XEN 34 depends on XEN
34 help 35 help
35 This only affects the sizing of some bss arrays, the unused 36 This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd98ea0..ef1db1900d86 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
 	.llseek = no_llseek,
 };
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements)
 {
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e28132084832..78d25499be5b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements);
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1f928659c338..12eb07bfb267 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void)
 	local_irq_disable();
 	early_boot_irqs_disabled = true;
 
-	memblock_init();
-
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
 	xen_ident_map_ISA();
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 5a40d24ba331..3a5f55d51907 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
 	return 0;
 }
 
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+			     unsigned long addr, void *data)
+{
+	uint64_t **frames = (uint64_t **)data;
+
+	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+	(*frames)++;
+	return 0;
+}
+
 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 			unsigned long addr, void *data)
 {
@@ -64,10 +78,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
-			   struct grant_entry **__shared)
+			   void **__shared)
 {
 	int rc;
-	struct grant_entry *shared = *__shared;
+	void *shared = *__shared;
 
 	if (shared == NULL) {
 		struct vm_struct *area =
@@ -83,8 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-			      unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	int rc;
+	grant_status_t *shared = *__shared;
+
+	if (shared == NULL) {
+		/* No need to pass in PTE as we are going to do it
+		 * in apply_to_page_range anyhow. */
+		struct vm_struct *area =
+			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+		BUG_ON(area == NULL);
+		shared = area->addr;
+		*__shared = shared;
+	}
+
+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+				 PAGE_SIZE * nr_gframes,
+				 map_pte_fn_status, &frames);
+	return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
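arch_gnttab_map_status() above follows the same shape as arch_gnttab_map_shared(): allocate a virtual area once, then let apply_to_page_range() invoke a per-PTE callback that consumes the next frame from a cursor passed through the data pointer. A rough user-space analogue of that callback-over-a-range pattern (illustrative only; apply_to_range and map_status_cb are made-up names, not kernel APIs):

/* Walk a range one page at a time and let a callback consume the
 * next frame, mirroring how map_pte_fn_status advances *frames. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

typedef int (*pte_cb_t)(unsigned long addr, void *data);

/* Simplified stand-in for apply_to_page_range(): call fn once per page. */
static int apply_to_range(unsigned long addr, unsigned long size,
			  pte_cb_t fn, void *data)
{
	unsigned long end = addr + size;
	int rc = 0;

	for (; addr < end && rc == 0; addr += PAGE_SIZE)
		rc = fn(addr, data);
	return rc;
}

/* Like map_pte_fn_status(): take the next frame from the cursor passed
 * through *data, "map" it at addr, then advance the cursor. */
static int map_status_cb(unsigned long addr, void *data)
{
	uint64_t **frames = data;

	printf("map frame %llu at 0x%lx\n",
	       (unsigned long long)(*frames)[0], addr);
	(*frames)++;
	return 0;
}

int main(void)
{
	uint64_t frames[] = { 100, 101, 102 };
	uint64_t *cursor = frames;

	/* &cursor mirrors passing &frames to apply_to_page_range(). */
	return apply_to_range(0x1000, 3 * PAGE_SIZE, map_status_cb, &cursor);
}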
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f5e1362550e7..e03c63692176 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -173,9 +173,21 @@ static unsigned long __init xen_get_max_pages(void)
 	domid_t domid = DOMID_SELF;
 	int ret;
 
-	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
-	if (ret > 0)
-		max_pages = ret;
+	/*
+	 * For the initial domain we use the maximum reservation as
+	 * the maximum page.
+	 *
+	 * For guest domains the current maximum reservation reflects
+	 * the current maximum rather than the static maximum. In this
+	 * case the e820 map provided to us will cover the static
+	 * maximum region.
+	 */
+	if (xen_initial_domain()) {
+		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+		if (ret > 0)
+			max_pages = ret;
+	}
+
 	return min(max_pages, MAX_DOMAIN_PAGES);
 }
 
@@ -409,6 +421,6 @@ void __init xen_arch_setup(void)
 #endif
 	disable_cpuidle();
 	boot_option_idle_override = IDLE_HALT;
-
+	WARN_ON(set_pm_idle_to_default());
 	fiddle_vdso();
 }